Преглед изворни кода

HADOOP-11445. Bzip2Codec: Data block is skipped when position of newly created stream is equal to start of split. Contributed by Ankit Kamboj
(cherry picked from commit d02fb53750bc592c23ba470ae82eb6f47d9a00ec)

Jason Lowe пре 10 година
родитељ
комит
2b408d8dc7

+ 3 - 0
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -322,6 +322,9 @@ Release 2.7.0 - UNRELEASED
     HADOOP-11459. Fix recent findbugs in ActiveStandbyElector, NetUtils
     and ShellBasedIdMapping (vinayakumarb)
 
+    HADOOP-11445. Bzip2Codec: Data block is skipped when position of newly
+    created stream is equal to start of split (Ankit Kamboj via jlowe)
+
 Release 2.6.0 - 2014-11-18
 
   INCOMPATIBLE CHANGES

+ 1 - 1
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java

@@ -225,7 +225,7 @@ public class BZip2Codec implements Configurable, SplittableCompressionCodec {
     // ........................................^^[We align at wrong position!]
     // ...........................................................^^[While this pos is correct]
 
-    if (in.getPos() <= start) {
+    if (in.getPos() < start) {
       ((Seekable)seekableIn).seek(start);
       in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
     }

+ 21 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java

@@ -106,6 +106,27 @@ public class TestLineRecordReader {
     testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498);
   }
 
+  //This test ensures record reader doesn't lose records when it starts
+  //exactly at the starting byte of a bz2 compressed block
+  @Test
+  public void testBzip2SplitStartAtBlockMarker() throws IOException {
+    //136504 in blockEndingInCR.txt.bz2 is the byte at which the bz2 block ends
+    //In the following test cases record readers should iterate over all the records
+    //and should not miss any record.
+
+    //Start next split at just the start of the block.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136504);
+
+    //Start next split a byte forward in next block.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136505);
+
+    //Start next split 3 bytes forward in next block.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136508);
+
+    //Start next split 10 bytes from behind the end marker.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136494);
+  }
+
   // Use the LineRecordReader to read records from the file
   public ArrayList<String> readRecords(URL testFileUrl, int splitSize)
       throws IOException {