16 年之前 · 1678463545
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -123,6 +123,9 @@ Trunk (unreleased changes)
 
				     it down by monitoring for cumulative memory usage across tasks.
			
 
				     (Vinod Kumar Vavilapalli via yhemanth)
			
 
				 
			
 
				+    HADOOP-4640. Adds an input format that can split lzo compressed
			
 
				+    text files. (johan)
			
 
				+
			
 
				   OPTIMIZATIONS
			
 
				 
			
 
				     HADOOP-3293. Fixes FileInputFormat to do provide locations for splits
			
--- a/src/core/org/apache/hadoop/io/compress/LzopCodec.java
+++ b/src/core/org/apache/hadoop/io/compress/LzopCodec.java
@@ -408,18 +408,21 @@ public class LzopCodec extends LzoCodec {
 
				     }
			
 
				 
			
 
				     public void close() throws IOException {
			
 
				+      byte[] b = new byte[4096];
			
 
				+      while (!decompressor.finished()) {
			
 
				+        decompressor.decompress(b, 0, b.length);
			
 
				+      }
			
 
				       super.close();
			
 
				       verifyChecksums();
			
 
				     }
			
 
				   }
			
 
				 
			
 
				-  protected static class LzopDecompressor extends LzoDecompressor {
			
 
				+  public static class LzopDecompressor extends LzoDecompressor {
			
 
				 
			
 
				     private EnumMap<DChecksum,Checksum> chkDMap =
			
 
				       new EnumMap<DChecksum,Checksum>(DChecksum.class);
			
 
				     private EnumMap<CChecksum,Checksum> chkCMap =
			
 
				       new EnumMap<CChecksum,Checksum>(CChecksum.class);
			
 
				-    private final int bufferSize;
			
 
				 
			
 
				     /**
			
 
				      * Create an LzoDecompressor with LZO1X strategy (the only lzo algorithm
			
@@ -427,9 +430,17 @@ public class LzopCodec extends LzoCodec {
 
				      */
			
 
				     public LzopDecompressor(int bufferSize) {
			
 
				       super(LzoDecompressor.CompressionStrategy.LZO1X_SAFE, bufferSize);
			
 
				-      this.bufferSize = bufferSize;
			
 
				     }
			
 
				 
			
 
				+    /**
			
 
				+     * Get the number of checksum implementations
			
 
				+     * the current lzo file uses.
			
 
				+     * @return Number of checksum implementations in use.
			
 
				+     */
			
 
				+    public int getChecksumsCount() {
			
 
				+      return this.chkCMap.size() + this.chkDMap.size();
			
 
				+    }
			
 
				+    
			
 
				     /**
			
 
				      * Given a set of decompressed and compressed checksums, 
			
 
				      */
			
--- a/src/mapred/org/apache/hadoop/mapred/LzoTextInputFormat.java
+++ b/src/mapred/org/apache/hadoop/mapred/LzoTextInputFormat.java
@@ -0,0 +1,385 @@
 
				+/**
			
 
				+ * Licensed to the Apache Software Foundation (ASF) under one
			
 
				+ * or more contributor license agreements.  See the NOTICE file
			
 
				+ * distributed with this work for additional information
			
 
				+ * regarding copyright ownership.  The ASF licenses this file
			
 
				+ * to you under the Apache License, Version 2.0 (the
			
 
				+ * "License"); you may not use this file except in compliance
			
 
				+ * with the License.  You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+package org.apache.hadoop.mapred;
			
 
				+
			
 
				+import java.io.EOFException;
			
 
				+import java.io.IOException;
			
 
				+import java.util.ArrayList;
			
 
				+import java.util.Arrays;
			
 
				+import java.util.HashMap;
			
 
				+import java.util.List;
			
 
				+import java.util.Map;
			
 
				+
			
 
				+import org.apache.commons.logging.Log;
			
 
				+import org.apache.commons.logging.LogFactory;
			
 
				+import org.apache.hadoop.conf.Configuration;
			
 
				+import org.apache.hadoop.fs.FSDataInputStream;
			
 
				+import org.apache.hadoop.fs.FSDataOutputStream;
			
 
				+import org.apache.hadoop.fs.FileSystem;
			
 
				+import org.apache.hadoop.fs.Path;
			
 
				+import org.apache.hadoop.fs.PathFilter;
			
 
				+import org.apache.hadoop.io.LongWritable;
			
 
				+import org.apache.hadoop.io.Text;
			
 
				+import org.apache.hadoop.io.compress.CompressionCodec;
			
 
				+import org.apache.hadoop.io.compress.CompressionCodecFactory;
			
 
				+import org.apache.hadoop.io.compress.LzopCodec;
			
 
				+import org.apache.hadoop.io.compress.LzopCodec.LzopDecompressor;
			
 
				+import org.apache.hadoop.util.LineReader;
			
 
				+
			
 
				+/**
			
 
				+ * An {@link InputFormat} for lzop compressed text files. Files are broken into
			
 
				+ * lines. Either linefeed or carriage-return are used to signal end of line.
			
 
				+ * Keys are the position in the file, and values are the line of text.
			
 
				+ */
			
 
				+public class LzoTextInputFormat extends FileInputFormat<LongWritable, Text>
			
 
				+    implements JobConfigurable {
			
 
				+
			
 
				+  private static final Log LOG
			
 
				+    = LogFactory.getLog(LzoTextInputFormat.class.getName());
			
 
				+  
			
 
				+  public static final String LZO_INDEX_SUFFIX = ".index";
			
 
				+
			
 
				+  public void configure(JobConf conf) {
			
 
				+    FileInputFormat.setInputPathFilter(conf, LzopFilter.class);
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * We don't want to process the index files.
			
 
				+   */
			
 
				+  static class LzopFilter implements PathFilter {
			
 
				+    public boolean accept(Path path) {
			
 
				+      if (path.toString().endsWith(LZO_INDEX_SUFFIX)) {
			
 
				+        return false;
			
 
				+      }
			
 
				+      return true;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  protected boolean isSplitable(FileSystem fs, Path file) {
			
 
				+    Path indexFile = new Path(file.toString()
			
 
				+        + LzoTextInputFormat.LZO_INDEX_SUFFIX);
			
 
				+
			
 
				+    try {
			
 
				+      // can't split without the index
			
 
				+      return fs.exists(indexFile);
			
 
				+    } catch (IOException e) {
			
 
				+      LOG.warn("Could not check if index file exists", e);
			
 
				+      return false;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  public RecordReader<LongWritable, Text> getRecordReader(
			
 
				+      InputSplit genericSplit, JobConf job, Reporter reporter)
			
 
				+    throws IOException {
			
 
				+
			
 
				+    reporter.setStatus(genericSplit.toString());
			
 
				+    return new LzoLineRecordReader(job, (FileSplit) genericSplit);
			
 
				+  }
			
 
				+
			
 
				+  @Override
			
 
				+  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
			
 
				+    FileSplit[] splits = (FileSplit[]) super.getSplits(job, numSplits);
			
 
				+    // find new start/ends of the filesplit that aligns
			
 
				+    // with the lzo blocks
			
 
				+
			
 
				+    List<FileSplit> result = new ArrayList<FileSplit>();
			
 
				+    FileSystem fs = FileSystem.get(job);
			
 
				+
			
 
				+    Map<Path, LzoIndex> indexes = new HashMap<Path, LzoIndex>();
			
 
				+    for (int i = 0; i < splits.length; i++) {
			
 
				+      FileSplit fileSplit = splits[i];
			
 
				+      // load the index
			
 
				+      Path file = fileSplit.getPath();
			
 
				+      if (!indexes.containsKey(file)) {
			
 
				+        LzoIndex index = readIndex(file, fs);
			
 
				+        if (index.isEmpty()) {
			
 
				+          // keep it as is since we didn't find an index
			
 
				+          result.add(fileSplit);
			
 
				+          continue;
			
 
				+        }
			
 
				+
			
 
				+        indexes.put(file, index);
			
 
				+      }
			
 
				+
			
 
				+      LzoIndex index = indexes.get(file);
			
 
				+      long start = fileSplit.getStart();
			
 
				+      long end = start + fileSplit.getLength();
			
 
				+
			
 
				+      if (start != 0) {
			
 
				+        // find the next block position from
			
 
				+        // the start of the split
			
 
				+        long newStart = index.findNextPosition(start);
			
 
				+        if (newStart == -1 || newStart >= end) {
			
 
				+          // just skip this since it will be handled by another split
			
 
				+          continue;
			
 
				+        }
			
 
				+        start = newStart;
			
 
				+      }
			
 
				+
			
 
				+      long newEnd = index.findNextPosition(end);
			
 
				+      if (newEnd != -1) {
			
 
				+        end = newEnd;
			
 
				+      }
			
 
				+
			
 
				+      result.add(new FileSplit(file, start, end - start, fileSplit
			
 
				+          .getLocations()));
			
 
				+    }
			
 
				+
			
 
				+    return result.toArray(new FileSplit[] {});
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Read the index of the lzo file.
			
 
				+   * 
			
 
				+   * @param split Read the index of this file.
			
 
				+   * @param fs The index file is on this file system.
			
 
				+   * @throws IOException
			
 
				+   */
			
 
				+  private LzoIndex readIndex(Path file, FileSystem fs) throws IOException {
			
 
				+    FSDataInputStream indexIn = null;
			
 
				+    try {
			
 
				+      Path indexFile = new Path(file.toString() + LZO_INDEX_SUFFIX);
			
 
				+      if (!fs.exists(indexFile)) {
			
 
				+        // return empty index, fall back to the unsplittable mode
			
 
				+        return new LzoIndex();
			
 
				+      }
			
 
				+      
			
 
				+      long indexLen = fs.getFileStatus(indexFile).getLen();
			
 
				+      int blocks = (int) (indexLen / 8);
			
 
				+      LzoIndex index = new LzoIndex(blocks);
			
 
				+      indexIn = fs.open(indexFile);
			
 
				+      for (int i = 0; i < blocks; i++) {
			
 
				+        index.set(i, indexIn.readLong());
			
 
				+      }
			
 
				+      return index;
			
 
				+    } finally {
			
 
				+      if (indexIn != null) {
			
 
				+        indexIn.close();
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Index an lzo file to allow the input format to split them into separate map
			
 
				+   * jobs.
			
 
				+   * 
			
 
				+   * @param fs File system that contains the file.
			
 
				+   * @param lzoFile the lzo file to index.
			
 
				+   * @throws IOException
			
 
				+   */
			
 
				+  public static void createIndex(FileSystem fs, Path lzoFile) 
			
 
				+    throws IOException {
			
 
				+    
			
 
				+    Configuration conf = fs.getConf();
			
 
				+    LzopCodec codec = new LzopCodec();
			
 
				+    codec.setConf(conf);
			
 
				+
			
 
				+    FSDataInputStream is = null;
			
 
				+    FSDataOutputStream os = null;
			
 
				+    try {
			
 
				+      is = fs.open(lzoFile);
			
 
				+      os = fs.create(new Path(lzoFile.toString()
			
 
				+          + LzoTextInputFormat.LZO_INDEX_SUFFIX));
			
 
				+      LzopDecompressor decompressor = (LzopDecompressor) codec
			
 
				+          .createDecompressor();
			
 
				+      // for reading the header
			
 
				+      codec.createInputStream(is, decompressor);
			
 
				+
			
 
				+      int numChecksums = decompressor.getChecksumsCount();
			
 
				+
			
 
				+      while (true) {
			
 
				+        //read and ignore, we just want to get to the next int
			
 
				+        int uncompressedBlockSize = is.readInt();
			
 
				+        if (uncompressedBlockSize == 0) {
			
 
				+          break;
			
 
				+        } else if (uncompressedBlockSize < 0) {
			
 
				+          throw new EOFException();
			
 
				+        }
			
 
				+        
			
 
				+        int compressedBlockSize = is.readInt();
			
 
				+        if (compressedBlockSize <= 0) {
			
 
				+          throw new IOException("Could not read compressed block size");
			
 
				+        }
			
 
				+
			
 
				+        long pos = is.getPos();
			
 
				+        // write the pos of the block start
			
 
				+        os.writeLong(pos - 8);
			
 
				+        // seek to the start of the next block, skip any checksums
			
 
				+        is.seek(pos + compressedBlockSize + (4 * numChecksums));
			
 
				+      }
			
 
				+    } finally {
			
 
				+      if (is != null) {
			
 
				+        is.close();
			
 
				+      }
			
 
				+
			
 
				+      if (os != null) {
			
 
				+        os.close();
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Represents the lzo index.
			
 
				+   */
			
 
				+  static class LzoIndex {
			
 
				+    
			
 
				+    private long[] blockPositions;
			
 
				+
			
 
				+    LzoIndex() {
			
 
				+    }   
			
 
				+    
			
 
				+    LzoIndex(int blocks) {
			
 
				+      blockPositions = new long[blocks];
			
 
				+    }
			
 
				+    
			
 
				+    /**
			
 
				+     * Set the position for the block.
			
 
				+     * @param blockNumber Block to set pos for.
			
 
				+     * @param pos Position.
			
 
				+     */
			
 
				+    public void set(int blockNumber, long pos) {
			
 
				+      blockPositions[blockNumber] = pos;
			
 
				+    }
			
 
				+    
			
 
				+    /**
			
 
				+     * Find the next lzo block start from the given position.
			
 
				+     * @param pos The position to start looking from.
			
 
				+     * @return Either the start position of the block or -1 if 
			
 
				+     * it couldn't be found.
			
 
				+     */
			
 
				+    public long findNextPosition(long pos) {
			
 
				+      int block = Arrays.binarySearch(blockPositions, pos);
			
 
				+
			
 
				+      if(block >= 0) {
			
 
				+        //direct hit on a block start position
			
 
				+        return blockPositions[block];
			
 
				+      } else {
			
 
				+        block = Math.abs(block) - 1;
			
 
				+        if(block > blockPositions.length - 1) {
			
 
				+          return -1;
			
 
				+        }
			
 
				+        return blockPositions[block];
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    public boolean isEmpty() {
			
 
				+      return blockPositions == null || blockPositions.length == 0;
			
 
				+    }    
			
 
				+    
			
 
				+  }
			
 
				+  
			
 
				+  /**
			
 
				+   * Reads line from an lzo compressed text file. Treats keys as offset in file
			
 
				+   * and value as line.
			
 
				+   */
			
 
				+  static class LzoLineRecordReader implements RecordReader<LongWritable, Text> {
			
 
				+
			
 
				+    private CompressionCodecFactory compressionCodecs = null;
			
 
				+    private long start;
			
 
				+    private long pos;
			
 
				+    private long end;
			
 
				+    private LineReader in;
			
 
				+    private FSDataInputStream fileIn;
			
 
				+
			
 
				+    public LzoLineRecordReader(Configuration job, FileSplit split)
			
 
				+      throws IOException {
			
 
				+
			
 
				+      start = split.getStart();
			
 
				+      end = start + split.getLength();
			
 
				+      final Path file = split.getPath();
			
 
				+
			
 
				+      FileSystem fs = file.getFileSystem(job);
			
 
				+
			
 
				+      compressionCodecs = new CompressionCodecFactory(job);
			
 
				+      final CompressionCodec codec = compressionCodecs.getCodec(file);
			
 
				+      if (codec == null) {
			
 
				+        throw new IOException("No lzo codec found, cannot run");
			
 
				+      }
			
 
				+
			
 
				+      // open the file and seek to the start of the split
			
 
				+      fileIn = fs.open(split.getPath());
			
 
				+
			
 
				+      // creates input stream and also reads the file header
			
 
				+      in = new LineReader(codec.createInputStream(fileIn), job);
			
 
				+
			
 
				+      if (start != 0) {
			
 
				+        fileIn.seek(start);
			
 
				+
			
 
				+        // read and ignore the first line
			
 
				+        in.readLine(new Text());
			
 
				+        start = fileIn.getPos();
			
 
				+      }
			
 
				+
			
 
				+      this.pos = start;
			
 
				+    }
			
 
				+
			
 
				+    public LongWritable createKey() {
			
 
				+      return new LongWritable();
			
 
				+    }
			
 
				+
			
 
				+    public Text createValue() {
			
 
				+      return new Text();
			
 
				+    }
			
 
				+
			
 
				+    /** Read a line. */
			
 
				+    public synchronized boolean next(LongWritable key, Text value)
			
 
				+      throws IOException {
			
 
				+
			
 
				+      //since the lzop codec reads everything in lzo blocks
			
 
				+      //we can't stop if the pos == end
			
 
				+      //instead we wait for the next block to be read in when
			
 
				+      //pos will be > end
			
 
				+      while (pos <= end) {
			
 
				+        key.set(pos);
			
 
				+
			
 
				+        int newSize = in.readLine(value);
			
 
				+        if (newSize == 0) {
			
 
				+          return false;
			
 
				+        }
			
 
				+        pos = fileIn.getPos();
			
 
				+
			
 
				+        return true;
			
 
				+      }
			
 
				+
			
 
				+      return false;
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * Get the progress within the split.
			
 
				+     */
			
 
				+    public float getProgress() {
			
 
				+      if (start == end) {
			
 
				+        return 0.0f;
			
 
				+      } else {
			
 
				+        return Math.min(1.0f, (pos - start) / (float) (end - start));
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    public synchronized long getPos() throws IOException {
			
 
				+      return pos;
			
 
				+    }
			
 
				+
			
 
				+    public synchronized void close() throws IOException {
			
 
				+      if (in != null) {
			
 
				+        in.close();
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+}
			
--- a/src/test/org/apache/hadoop/mapred/TestLzoTextInputFormat.java
+++ b/src/test/org/apache/hadoop/mapred/TestLzoTextInputFormat.java
@@ -0,0 +1,222 @@
 
				+/**
			
 
				+ * Licensed to the Apache Software Foundation (ASF) under one
			
 
				+ * or more contributor license agreements.  See the NOTICE file
			
 
				+ * distributed with this work for additional information
			
 
				+ * regarding copyright ownership.  The ASF licenses this file
			
 
				+ * to you under the Apache License, Version 2.0 (the
			
 
				+ * "License"); you may not use this file except in compliance
			
 
				+ * with the License.  You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+package org.apache.hadoop.mapred;
			
 
				+
			
 
				+import java.io.IOException;
			
 
				+import java.security.MessageDigest;
			
 
				+import java.security.NoSuchAlgorithmException;
			
 
				+import java.util.Arrays;
			
 
				+import java.util.Random;
			
 
				+
			
 
				+import junit.framework.TestCase;
			
 
				+
			
 
				+import org.apache.commons.logging.Log;
			
 
				+import org.apache.commons.logging.LogFactory;
			
 
				+import org.apache.hadoop.fs.FileSystem;
			
 
				+import org.apache.hadoop.fs.Path;
			
 
				+import org.apache.hadoop.io.LongWritable;
			
 
				+import org.apache.hadoop.io.Text;
			
 
				+import org.apache.hadoop.io.compress.LzopCodec;
			
 
				+import org.apache.hadoop.mapred.LzoTextInputFormat.LzoIndex;
			
 
				+import org.apache.hadoop.util.NativeCodeLoader;
			
 
				+
			
 
				+/**
			
 
				+ * Test the LzoTextInputFormat, make sure it splits the file properly and
			
 
				+ * returns the right data.
			
 
				+ */
			
 
				+public class TestLzoTextInputFormat extends TestCase {
			
 
				+
			
 
				+  private static final Log LOG
			
 
				+    = LogFactory.getLog(TestLzoTextInputFormat.class.getName());
			
 
				+  
			
 
				+  private MessageDigest md5;
			
 
				+  private String lzoFileName = "file";
			
 
				+  
			
 
				+  @Override
			
 
				+  protected void setUp() throws Exception {
			
 
				+    super.setUp();
			
 
				+    md5 = MessageDigest.getInstance("MD5");
			
 
				+  }
			
 
				+  
			
 
				+  /**
			
 
				+   * Make sure the lzo index class works as described.
			
 
				+   */
			
 
				+  public void testLzoIndex() {
			
 
				+    LzoIndex index = new LzoIndex();
			
 
				+    assertTrue(index.isEmpty());
			
 
				+    index = new LzoIndex(4);
			
 
				+    index.set(0, 0);
			
 
				+    index.set(1, 5);
			
 
				+    index.set(2, 10);
			
 
				+    index.set(3, 15);
			
 
				+    assertFalse(index.isEmpty());
			
 
				+    
			
 
				+    assertEquals(0, index.findNextPosition(-1));
			
 
				+    assertEquals(5, index.findNextPosition(1));
			
 
				+    assertEquals(5, index.findNextPosition(5));
			
 
				+    assertEquals(15, index.findNextPosition(11));
			
 
				+    assertEquals(15, index.findNextPosition(15));
			
 
				+    assertEquals(-1, index.findNextPosition(16));
			
 
				+  }
			
 
				+  
			
 
				+  /**
			
 
				+   * Index the file and make sure it splits properly.
			
 
				+   * @throws NoSuchAlgorithmException
			
 
				+   * @throws IOException
			
 
				+   */
			
 
				+  public void testWithIndex() throws NoSuchAlgorithmException, IOException {
			
 
				+    runTest(true);
			
 
				+  }
			
 
				+  
			
 
				+  /**
			
 
				+   * Don't index the file and make sure it can be processed anyway.
			
 
				+   * @throws NoSuchAlgorithmException
			
 
				+   * @throws IOException
			
 
				+   */
			
 
				+  public void testWithoutIndex() throws NoSuchAlgorithmException, IOException {
			
 
				+    runTest(false);
			
 
				+  }
			
 
				+  
			
 
				+  private void runTest(boolean testWithIndex) throws IOException, 
			
 
				+    NoSuchAlgorithmException {
			
 
				+    
			
 
				+    if (!NativeCodeLoader.isNativeCodeLoaded()) {
			
 
				+      LOG.warn("Cannot run this test without the native lzo libraries");
			
 
				+      return;
			
 
				+    }
			
 
				+
			
 
				+    String attempt = "attempt_200707121733_0001_m_000000_0";
			
 
				+    Path workDir = new Path(new Path(new Path(System.getProperty(
			
 
				+        "test.build.data", "."), "data"), FileOutputCommitter.TEMP_DIR_NAME),
			
 
				+        "_" + attempt);
			
 
				+    Path outputDir = workDir.getParent().getParent();
			
 
				+
			
 
				+    JobConf conf = new JobConf();
			
 
				+    conf.set("mapred.task.id", attempt);
			
 
				+    conf.set("io.compression.codecs", LzopCodec.class.getName());
			
 
				+
			
 
				+    FileSystem localFs = FileSystem.getLocal(conf);
			
 
				+    localFs.delete(workDir, true);
			
 
				+    localFs.mkdirs(workDir);
			
 
				+    FileInputFormat.setInputPaths(conf, outputDir);
			
 
				+
			
 
				+    
			
 
				+    // create some input data
			
 
				+    byte[] expectedMd5 = createTestInput(outputDir, workDir, conf, localFs);
			
 
				+
			
 
				+    if(testWithIndex) {
			
 
				+      Path lzoFile = new Path(workDir, lzoFileName);
			
 
				+      LzoTextInputFormat.createIndex(localFs, new Path(lzoFile
			
 
				+        + new LzopCodec().getDefaultExtension()));
			
 
				+    }
			
 
				+    
			
 
				+    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
			
 
				+    inputFormat.configure(conf);
			
 
				+    
			
 
				+    //it's left in the work dir
			
 
				+    FileInputFormat.setInputPaths(conf, workDir);
			
 
				+
			
 
				+    int numSplits = 3;
			
 
				+    InputSplit[] is = inputFormat.getSplits(conf, numSplits);
			
 
				+    if(testWithIndex) {
			
 
				+      assertEquals(numSplits, is.length);
			
 
				+    } else {
			
 
				+      assertEquals(1, is.length);
			
 
				+    }
			
 
				+
			
 
				+    for (InputSplit inputSplit : is) {
			
 
				+      RecordReader<LongWritable, Text> rr = inputFormat.getRecordReader(
			
 
				+          inputSplit, conf, Reporter.NULL);
			
 
				+      LongWritable key = rr.createKey();
			
 
				+      Text value = rr.createValue();
			
 
				+
			
 
				+      while (rr.next(key, value)) {
			
 
				+        md5.update(value.getBytes(), 0, value.getLength());
			
 
				+      }
			
 
				+
			
 
				+      rr.close();
			
 
				+    }
			
 
				+
			
 
				+    assertTrue(Arrays.equals(expectedMd5, md5.digest()));
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Creates an lzo file with random data. 
			
 
				+   * 
			
 
				+   * @param outputDir Output directory
			
 
				+   * @param workDir Work directory, this is where the file is written to
			
 
				+   * @param fs File system we're using
			
 
				+   * @throws IOException
			
 
				+   */
			
 
				+  private byte[] createTestInput(Path outputDir, Path workDir, JobConf conf,
			
 
				+      FileSystem fs) throws IOException {
			
 
				+
			
 
				+    RecordWriter<Text, Text> rw = null;
			
 
				+    
			
 
				+    md5.reset();
			
 
				+    
			
 
				+    try {
			
 
				+      TextOutputFormat<Text, Text> output = new TextOutputFormat<Text, Text>();
			
 
				+      TextOutputFormat.setCompressOutput(conf, true);
			
 
				+      TextOutputFormat.setOutputCompressorClass(conf, LzopCodec.class);
			
 
				+      TextOutputFormat.setOutputPath(conf, outputDir);
			
 
				+      TextOutputFormat.setWorkOutputPath(conf, workDir);
			
 
				+
			
 
				+      rw = output.getRecordWriter(null, conf, lzoFileName, Reporter.NULL);
			
 
				+
			
 
				+      //has to be enough data to create a couple of lzo blocks
			
 
				+      int charsToOutput = 10485760;
			
 
				+      char[] chars = "abcdefghijklmnopqrstuvwxyz\u00E5\u00E4\u00F6"
			
 
				+          .toCharArray();
			
 
				+
			
 
				+      Random r = new Random(System.currentTimeMillis());
			
 
				+      Text key = new Text();
			
 
				+      Text value = new Text();
			
 
				+      int charsMax = chars.length - 1;
			
 
				+      for (int i = 0; i < charsToOutput;) {
			
 
				+        i += fillText(chars, r, charsMax, key);
			
 
				+        i += fillText(chars, r, charsMax, value);
			
 
				+        rw.write(key, value);
			
 
				+        md5.update(key.getBytes(), 0, key.getLength());
			
 
				+        //text output format writes tab between the key and value
			
 
				+        md5.update("\t".getBytes("UTF-8")); 
			
 
				+        md5.update(value.getBytes(), 0, value.getLength());
			
 
				+      }
			
 
				+    } finally {
			
 
				+      if (rw != null) {
			
 
				+        rw.close(Reporter.NULL);
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    byte[] result = md5.digest();
			
 
				+    md5.reset();
			
 
				+    return result;
			
 
				+  }
			
 
				+
			
 
				+  private int fillText(char[] chars, Random r, int charsMax, Text text) {
			
 
				+    StringBuilder sb = new StringBuilder();
			
 
				+    //get a reasonable string length 
			
 
				+    int stringLength = r.nextInt(charsMax * 2);
			
 
				+    for (int j = 0; j < stringLength; j++) {
			
 
				+      sb.append(chars[r.nextInt(charsMax)]);
			
 
				+    }
			
 
				+    text.set(sb.toString());
			
 
				+    return stringLength;
			
 
				+  }
			
 
				+
			
 
				+}