@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+
+/**
+ * InputFormat reading keys, values from SequenceFiles in binary (raw)
+ * format.
+ */
+public class SequenceFileAsBinaryInputFormat
+    extends SequenceFileInputFormat<BytesWritable,BytesWritable> {
+
+  public SequenceFileAsBinaryInputFormat() {
+    super();
+  }
+
+  public RecordReader<BytesWritable,BytesWritable> getRecordReader(
+      InputSplit split, JobConf job, Reporter reporter)
+      throws IOException {
+    return new SequenceFileAsBinaryRecordReader(job, (FileSplit)split);
+  }
+
+  /**
+   * Read records from a SequenceFile as binary (raw) bytes.
+   */
+  public static class SequenceFileAsBinaryRecordReader
+      implements RecordReader<BytesWritable,BytesWritable> {
+    private SequenceFile.Reader in;
+    private long start;
+    private long end;
+    private boolean done = false;
+    private DataOutputBuffer buffer = new DataOutputBuffer();
+    private SequenceFile.ValueBytes vbytes;
+
+    public SequenceFileAsBinaryRecordReader(Configuration conf, FileSplit split)
+        throws IOException {
+      Path path = split.getPath();
+      FileSystem fs = path.getFileSystem(conf);
+      this.in = new SequenceFile.Reader(fs, path, conf);
+      this.end = split.getStart() + split.getLength();
+      vbytes = in.createValueBytes();
+    }
+
+    public BytesWritable createKey() {
+      return new BytesWritable();
+    }
+
+    public BytesWritable createValue() {
+      return new BytesWritable();
+    }
+
+    /**
+     * Retrieve the name of the key class for this SequenceFile.
+     * @see org.apache.hadoop.io.SequenceFile.Reader#getKeyClassName
+     */
+    public String getKeyClassName() {
+      return in.getKeyClassName();
+    }
+
+    /**
+     * Retrieve the name of the value class for this SequenceFile.
+     * @see org.apache.hadoop.io.SequenceFile.Reader#getValueClassName
+     */
+    public String getValueClassName() {
+      return in.getValueClassName();
+    }
+
+    /**
+     * Read raw bytes from a SequenceFile.
+     */
+    public synchronized boolean next(BytesWritable key, BytesWritable val)
+        throws IOException {
+      if (done) return false;
+      long pos = in.getPosition();
+      boolean eof = -1 == in.nextRawKey(buffer);
+      if (!eof) {
+        key.set(buffer.getData(), 0, buffer.getLength());
+        buffer.reset();
+        in.nextRawValue(vbytes);
+        vbytes.writeUncompressedBytes(buffer);
+        val.set(buffer.getData(), 0, buffer.getLength());
+        buffer.reset();
+      }
+      return !(done = (eof || (pos >= end && in.syncSeen())));
+    }
+
+    public long getPos() throws IOException {
+      return in.getPosition();
+    }
+
+    public void close() throws IOException {
+      in.close();
+    }
+
+    /**
+     * Return the progress within the input split
+     * @return 0.0 to 1.0 of the input byte range
+     */
+    public float getProgress() throws IOException {
+      if (end == start) {
+        return 0.0f;
+      } else {
+        return Math.min(1.0f, (float)((in.getPosition() - start) /
+                                      (double)(end - start)));
+      }
+    }
+  }
+}
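
For context, a minimal driver sketch (illustrative only, not part of the patch) showing how a classic-API (org.apache.hadoop.mapred) job might consume SequenceFiles through SequenceFileAsBinaryInputFormat. The RawBytesDriver and RawBytesMapper class names, the job name, and the input/output paths are hypothetical placeholders; the FileInputFormat, FileOutputFormat, and JobClient calls assume the contemporaneous mapred API.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat;

// Hypothetical driver: passes raw SequenceFile records through a map-only job.
public class RawBytesDriver {

  // Identity-style mapper: keys and values arrive as uninterpreted bytes.
  public static class RawBytesMapper extends MapReduceBase
      implements Mapper<BytesWritable, BytesWritable, BytesWritable, BytesWritable> {
    public void map(BytesWritable key, BytesWritable value,
        OutputCollector<BytesWritable, BytesWritable> output, Reporter reporter)
        throws IOException {
      output.collect(key, value);  // emit the raw record unchanged
    }
  }

  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(RawBytesDriver.class);
    job.setJobName("raw-bytes-passthrough");

    // Read each SequenceFile record as (BytesWritable, BytesWritable).
    job.setInputFormat(SequenceFileAsBinaryInputFormat.class);
    job.setMapperClass(RawBytesMapper.class);
    job.setNumReduceTasks(0);  // map-only job

    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));   // placeholder input
    FileOutputFormat.setOutputPath(job, new Path(args[1]));  // placeholder output

    JobClient.runJob(job);
  }
}

Because the reader hands keys and values to the mapper as uninterpreted bytes, the map function never has to deserialize them; getKeyClassName() and getValueClassName() remain available if a downstream consumer wants to interpret the bytes later.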