浏览代码

HADOOP-11445. Bzip2Codec: Data block is skipped when position of newly created stream is equal to start of split. Contributed by Ankit Kamboj
(cherry picked from commit d02fb53750bc592c23ba470ae82eb6f47d9a00ec)

Jason Lowe 10 年之前
父节点
当前提交
2b408d8dc7

+ 3 - 0
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -322,6 +322,9 @@ Release 2.7.0 - UNRELEASED
     HADOOP-11459. Fix recent findbugs in ActiveStandbyElector, NetUtils
     HADOOP-11459. Fix recent findbugs in ActiveStandbyElector, NetUtils
     and ShellBasedIdMapping (vinayakumarb)
     and ShellBasedIdMapping (vinayakumarb)
 
 
+    HADOOP-11445. Bzip2Codec: Data block is skipped when position of newly
+    created stream is equal to start of split (Ankit Kamboj via jlowe)
+
 Release 2.6.0 - 2014-11-18
 Release 2.6.0 - 2014-11-18
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 1 - 1
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java

@@ -225,7 +225,7 @@ public class BZip2Codec implements Configurable, SplittableCompressionCodec {
     // ........................................^^[We align at wrong position!]
     // ........................................^^[We align at wrong position!]
     // ...........................................................^^[While this pos is correct]
     // ...........................................................^^[While this pos is correct]
 
 
-    if (in.getPos() <= start) {
+    if (in.getPos() < start) {
       ((Seekable)seekableIn).seek(start);
       ((Seekable)seekableIn).seek(start);
       in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
       in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
     }
     }

+ 21 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java

@@ -106,6 +106,27 @@ public class TestLineRecordReader {
     testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498);
     testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498);
   }
   }
 
 
+  //This test ensures the record reader doesn't lose records when a split
+  //starts exactly at the first byte of a bz2 compressed block
+  @Test
+  public void testBzip2SplitStartAtBlockMarker() throws IOException {
+    //136504 in blockEndingInCR.txt.bz2 is the byte at which the bz2 block ends.
+    //In each of the following test cases the record readers should iterate
+    //over all the records without missing any.
+
+    //Start next split exactly at the start of the block.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136504);
+
+    //Start next split a byte forward in next block.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136505);
+
+    //Start next split 4 bytes past the block boundary (3 bytes past the previous case).
+    testSplitRecords("blockEndingInCR.txt.bz2", 136508);
+
+    //Start next split 10 bytes before the end of the block.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136494);
+  }
+
   // Use the LineRecordReader to read records from the file
   // Use the LineRecordReader to read records from the file
   public ArrayList<String> readRecords(URL testFileUrl, int splitSize)
   public ArrayList<String> readRecords(URL testFileUrl, int splitSize)
       throws IOException {
       throws IOException {