17 年之前 · e013d15ab7
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -86,6 +86,11 @@ Trunk (unreleased changes)
 
				     be corrupt, retain all copies and mark the block as corrupt.
			
 
				     (Lohit Vjayarenu via rangadi)
			
 
				 
			
 
				+    HADOOP-3221. Adds org.apache.hadoop.mapred.lib.NLineInputFormat, which 
			
 
				+    splits files into splits each of N lines. N can be specified by 
			
 
				+    configuration property "mapred.line.input.format.linespermap", which
			
 
				+    defaults to 1. (Amareshwari Sriramadasu via ddas) 
			
 
				+
			
 
				   IMPROVEMENTS
			
 
				    
			
 
				     HADOOP-2928. Remove deprecated FileSystem.getContentLength().
			
--- a/conf/hadoop-default.xml
+++ b/conf/hadoop-default.xml
@@ -1057,6 +1057,13 @@ creations/deletions), or "all".</description>
 
				     </description>
			
 
				   </property>
			
 
				 
			
 
				+  <property>
			
 
				+    <name>mapred.line.input.format.linespermap</name>
			
 
				+    <value>1</value>
			
 
				+    <description> Number of lines per split in NLineInputFormat.
			
 
				+    </description>
			
 
				+  </property>
			
 
				+
			
 
				 <!-- ipc properties -->
			
 
				 
			
 
				 <property>
			
--- a/src/java/org/apache/hadoop/mapred/lib/NLineInputFormat.java
+++ b/src/java/org/apache/hadoop/mapred/lib/NLineInputFormat.java
@@ -0,0 +1,124 @@
 
				+/**
			
 
				+ * Licensed to the Apache Software Foundation (ASF) under one
			
 
				+ * or more contributor license agreements.  See the NOTICE file
			
 
				+ * distributed with this work for additional information
			
 
				+ * regarding copyright ownership.  The ASF licenses this file
			
 
				+ * to you under the Apache License, Version 2.0 (the
			
 
				+ * "License"); you may not use this file except in compliance
			
 
				+ * with the License.  You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+package org.apache.hadoop.mapred.lib;
			
 
				+
			
 
				+import java.io.IOException;
			
 
				+import java.util.ArrayList;
			
 
				+
			
 
				+import org.apache.hadoop.fs.FSDataInputStream;
			
 
				+import org.apache.hadoop.fs.FileStatus;
			
 
				+import org.apache.hadoop.fs.FileSystem;
			
 
				+import org.apache.hadoop.fs.Path;
			
 
				+import org.apache.hadoop.io.LongWritable;
			
 
				+import org.apache.hadoop.io.Text;
			
 
				+import org.apache.hadoop.mapred.FileInputFormat;
			
 
				+import org.apache.hadoop.mapred.FileSplit;
			
 
				+import org.apache.hadoop.mapred.InputSplit;
			
 
				+import org.apache.hadoop.mapred.JobConf;
			
 
				+import org.apache.hadoop.mapred.JobConfigurable;
			
 
				+import org.apache.hadoop.mapred.LineRecordReader;
			
 
				+import org.apache.hadoop.mapred.RecordReader;
			
 
				+import org.apache.hadoop.mapred.Reporter;
			
 
				+import org.apache.hadoop.mapred.LineRecordReader.LineReader;
			
 
				+
			
 
				+/**
			
 
				+ * NLineInputFormat splits the input so that every N lines form one split.
			
 
				+ *
			
 
				+ * In many "pleasantly" parallel applications, each process/mapper 
			
 
				+ * processes the same input file(s), but with computations
			
 
				+ * controlled by different parameters (referred to as "parameter sweeps").
			
 
				+ * One way to achieve this is to specify a set of parameters
			
 
				+ * (one set per line) as input in a control file 
			
 
				+ * (which is the input path to the map-reduce application,
			
 
				+ * whereas the input dataset is specified
			
 
				+ * via a config variable in JobConf).
			
 
				+ * 
			
 
				+ * NLineInputFormat can be used in such applications: it splits
			
 
				+ * the input file so that, by default, one line is fed as
			
 
				+ * a value to one map task, and the key is the offset.
			
 
				+ * i.e. (k,v) is (LongWritable, Text).
			
 
				+ * The location hints will span the whole mapred cluster.
			
 
				+ */
			
 
				+
			
 
				+public class NLineInputFormat extends FileInputFormat<LongWritable, Text> 
			
 
				+                              implements JobConfigurable { 
			
 
				+  private int N = 1;
			
 
				+
			
 
				+  public RecordReader<LongWritable, Text> getRecordReader(
			
 
				+                                            InputSplit genericSplit,
			
 
				+                                            JobConf job,
			
 
				+                                            Reporter reporter) 
			
 
				+  throws IOException {
			
 
				+    reporter.setStatus(genericSplit.toString());
			
 
				+    return new LineRecordReader(job, (FileSplit) genericSplit);
			
 
				+  }
			
 
				+
			
 
				+  /** 
			
 
				+   * Logically splits the set of input files for the job, grouping N lines
			
 
				+   * of the input as one split.
			
 
				+   * 
			
 
				+   * @see org.apache.hadoop.mapred.FileInputFormat#getSplits(JobConf, int)
			
 
				+   */
			
 
				+  public InputSplit[] getSplits(JobConf job, int numSplits)
			
 
				+  throws IOException {
			
 
				+    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
			
 
				+    Path[] files = listPaths(job);
			
 
				+    for (int i=0; i < files.length; i++) {
			
 
				+      Path fileName = files[i];
			
 
				+      FileSystem  fs = fileName.getFileSystem(job);
			
 
				+      FileStatus status = fs.getFileStatus(fileName);
			
 
				+      if (status.isDir() || !fs.exists(fileName)) {
			
 
				+        throw new IOException("Not a file: " + fileName);
			
 
				+      }
			
 
				+      LineReader lr = null;
			
 
				+      try {
			
 
				+        FSDataInputStream in  = fs.open(fileName);
			
 
				+        lr = new LineReader(in, job);
			
 
				+        Text line = new Text();
			
 
				+        int numLines = 0;
			
 
				+        long begin = 0;
			
 
				+        long length = 0;
			
 
				+        int num = -1;
			
 
				+        while ((num = lr.readLine(line)) > 0) {
			
 
				+          numLines++;
			
 
				+          length += num;
			
 
				+          if (numLines == N) {
			
 
				+            splits.add(new FileSplit(fileName, begin, length, new String[]{}));
			
 
				+            begin += length;
			
 
				+            length = 0;
			
 
				+            numLines = 0;
			
 
				+          }
			
 
				+        }
			
 
				+        if (numLines != 0) {
			
 
				+          splits.add(new FileSplit(fileName, begin, length, new String[]{}));
			
 
				+        }
			
 
				+   
			
 
				+      } finally {
			
 
				+        if (lr != null) {
			
 
				+          lr.close();
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+    return splits.toArray(new FileSplit[splits.size()]);
			
 
				+  }
			
 
				+
			
 
				+  public void configure(JobConf conf) {
			
 
				+    N = conf.getInt("mapred.line.input.format.linespermap", 1);
			
 
				+  }
			
 
				+}
			
--- a/src/test/org/apache/hadoop/mapred/lib/TestLineInputFormat.java
+++ b/src/test/org/apache/hadoop/mapred/lib/TestLineInputFormat.java
@@ -0,0 +1,118 @@
 
				+/**
			
 
				+ * Licensed to the Apache Software Foundation (ASF) under one
			
 
				+ * or more contributor license agreements.  See the NOTICE file
			
 
				+ * distributed with this work for additional information
			
 
				+ * regarding copyright ownership.  The ASF licenses this file
			
 
				+ * to you under the Apache License, Version 2.0 (the
			
 
				+ * "License"); you may not use this file except in compliance
			
 
				+ * with the License.  You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+package org.apache.hadoop.mapred.lib;
			
 
				+
			
 
				+import java.io.*;
			
 
				+import java.util.*;
			
 
				+import junit.framework.TestCase;
			
 
				+
			
 
				+import org.apache.hadoop.fs.*;
			
 
				+import org.apache.hadoop.io.*;
			
 
				+import org.apache.hadoop.mapred.*;
			
 
				+
			
 
				+public class TestLineInputFormat extends TestCase {
			
 
				+  private static int MAX_LENGTH = 200;
			
 
				+  
			
 
				+  private static JobConf defaultConf = new JobConf();
			
 
				+  private static FileSystem localFs = null; 
			
 
				+
			
 
				+  static {
			
 
				+    try {
			
 
				+      localFs = FileSystem.getLocal(defaultConf);
			
 
				+    } catch (IOException e) {
			
 
				+      throw new RuntimeException("init failure", e);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  private static Path workDir = 
			
 
				+    new Path(new Path(System.getProperty("test.build.data", "."), "data"),
			
 
				+             "TestLineInputFormat");
			
 
				+  
			
 
				+  public void testFormat() throws Exception {
			
 
				+    JobConf job = new JobConf();
			
 
				+    Path file = new Path(workDir, "test.txt");
			
 
				+
			
 
				+    int seed = new Random().nextInt();
			
 
				+    Random random = new Random(seed);
			
 
				+
			
 
				+    localFs.delete(workDir, true);
			
 
				+    FileInputFormat.setInputPaths(job, workDir);
			
 
				+    int numLinesPerMap = 5;
			
 
				+    job.setInt("mapred.line.input.format.linespermap", numLinesPerMap);
			
 
				+
			
 
				+    // for a variety of lengths
			
 
				+    for (int length = 0; length < MAX_LENGTH;
			
 
				+         length += random.nextInt(MAX_LENGTH/10) + 1) {
			
 
				+      // create a file with length entries
			
 
				+      Writer writer = new OutputStreamWriter(localFs.create(file));
			
 
				+      try {
			
 
				+        for (int i = 0; i < length; i++) {
			
 
				+          writer.write(Integer.toString(i));
			
 
				+          writer.write("\n");
			
 
				+        }
			
 
				+      } finally {
			
 
				+        writer.close();
			
 
				+      }
			
 
				+      checkFormat(job, numLinesPerMap);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // A reporter that does nothing
			
 
				+  private static final Reporter voidReporter = Reporter.NULL;
			
 
				+  
			
 
				+  void checkFormat(JobConf job, int expectedN) throws IOException{
			
 
				+    NLineInputFormat format = new NLineInputFormat();
			
 
				+    format.configure(job);
			
 
				+    int ignoredNumSplits = 1;
			
 
				+    InputSplit[] splits = format.getSplits(job, ignoredNumSplits);
			
 
				+
			
 
				+    // check all splits except last one
			
 
				+    int count = 0;
			
 
				+    for (int j = 0; j < splits.length -1; j++) {
			
 
				+      assertEquals("There are no split locations", 0,
			
 
				+                   splits[j].getLocations().length);
			
 
				+      RecordReader<LongWritable, Text> reader =
			
 
				+        format.getRecordReader(splits[j], job, voidReporter);
			
 
				+      Class readerClass = reader.getClass();
			
 
				+      assertEquals("reader class is LineRecordReader.",
			
 
				+                   LineRecordReader.class, readerClass);        
			
 
				+      LongWritable key = reader.createKey();
			
 
				+      Class keyClass = key.getClass();
			
 
				+      assertEquals("Key class is LongWritable.", LongWritable.class, keyClass);
			
 
				+      Text value = reader.createValue();
			
 
				+      Class valueClass = value.getClass();
			
 
				+      assertEquals("Value class is Text.", Text.class, valueClass);
			
 
				+         
			
 
				+      try {
			
 
				+        count = 0;
			
 
				+        while (reader.next(key, value)) {
			
 
				+          count++;
			
 
				+        }
			
 
				+      } finally {
			
 
				+        reader.close();
			
 
				+      }
			
 
				+      assertEquals("number of lines in split is " + expectedN ,
			
 
				+                   expectedN, count);
			
 
				+    }
			
 
				+  }
			
 
				+  
			
 
				+  public static void main(String[] args) throws Exception {
			
 
				+    new TestLineInputFormat().testFormat();
			
 
				+  }
			
 
				+}