Bladeren bron

HADOOP-13192. org.apache.hadoop.util.LineReader cannot handle multibyte delimiters correctly. Contributed by binde.

(cherry picked from commit fc6b50cc574e144fd314dea6c11987c6a384bfa6)
(cherry picked from commit 39ea0891d2b1369ec8c1ea4312489309e1a85227)
(cherry picked from commit e19cd05a878aafc94cc0de36ac6638d7977c6e6e)
Akira Ajisaka 9 jaren geleden
bovenliggende
commit
534cdc8420

+ 3 - 0
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -51,6 +51,9 @@ Release 2.6.5 - UNRELEASED
     HADOOP-13052. ChecksumFileSystem mishandles crc file permissions.
     (Daryn Sharp via kihwal)
 
+    HADOOP-13192. org.apache.hadoop.util.LineReader cannot handle multibyte
+    delimiters correctly. (binde via aajisaka)
+
 Release 2.6.4 - 2016-02-11
 
   INCOMPATIBLE CHANGES

+ 4 - 1
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java

@@ -318,7 +318,10 @@ public class LineReader implements Closeable {
             break;
           }
         } else if (delPosn != 0) {
-          bufferPosn--;
+          bufferPosn -= delPosn;
+          if(bufferPosn < -1) {
+            bufferPosn = -1;
+          }
           delPosn = 0;
         }
       }

+ 37 - 22
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestLineReader.java

@@ -58,7 +58,7 @@ public class TestLineReader {
      * Check Condition
      *  In the second key value pair, the value should contain 
      *  "</"  from currentToken and
-     *  "id>" from next token 
+     *  "id>" from next token
      */  
     
     Delimiter="</entity>"; 
@@ -80,20 +80,21 @@ public class TestLineReader {
     String TestPartOfInput = CurrentBufferTailToken+NextBufferHeadToken;
   
     int BufferSize=64 * 1024;
-    int numberOfCharToFillTheBuffer=BufferSize-CurrentBufferTailToken.length();
+    int numberOfCharToFillTheBuffer =
+            BufferSize - CurrentBufferTailToken.length();
     StringBuilder fillerString=new StringBuilder();
-    for (int i=0;i<numberOfCharToFillTheBuffer;i++) {  
+    for (int i=0; i<numberOfCharToFillTheBuffer; i++) {
       fillerString.append('a'); // char 'a' as a filler for the test string
     }
 
     TestData = fillerString + TestPartOfInput;
     lineReader = new LineReader(
-        new ByteArrayInputStream(TestData.getBytes()),Delimiter.getBytes());
+        new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
     
     line = new Text();
     
-    lineReader.readLine(line); 
-    Assert.assertEquals(fillerString.toString(),line.toString());
+    lineReader.readLine(line);
+    Assert.assertEquals(fillerString.toString(), line.toString());
     
     lineReader.readLine(line);
     Assert.assertEquals(Expected, line.toString());
@@ -107,35 +108,49 @@ public class TestLineReader {
     Delimiter = "record";
     StringBuilder TestStringBuilder = new StringBuilder();
     
-    TestStringBuilder.append(Delimiter+"Kerala ");
-    TestStringBuilder.append(Delimiter+"Bangalore");
-    TestStringBuilder.append(Delimiter+" North Korea");
-    TestStringBuilder.append(Delimiter+Delimiter+
+    TestStringBuilder.append(Delimiter + "Kerala ");
+    TestStringBuilder.append(Delimiter + "Bangalore");
+    TestStringBuilder.append(Delimiter + " North Korea");
+    TestStringBuilder.append(Delimiter + Delimiter+
                         "Guantanamo");
-    TestStringBuilder.append(Delimiter+"ecord"+"recor"+"core"); //~EOF with 're'
+    TestStringBuilder.append(Delimiter + "ecord"
+            + "recor" + "core"); //~EOF with 're'
     
     TestData=TestStringBuilder.toString();
     
     lineReader = new LineReader(
-        new ByteArrayInputStream(TestData.getBytes()),Delimiter.getBytes());
-    
-    lineReader.readLine(line); 
-    Assert.assertEquals("",line.toString());
-    lineReader.readLine(line); 
-    Assert.assertEquals("Kerala ",line.toString());
+        new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
+
+    lineReader.readLine(line);
+    Assert.assertEquals("", line.toString());
+    lineReader.readLine(line);
+    Assert.assertEquals("Kerala ", line.toString());
     
     lineReader.readLine(line); 
-    Assert.assertEquals("Bangalore",line.toString());
+    Assert.assertEquals("Bangalore", line.toString());
     
     lineReader.readLine(line); 
-    Assert.assertEquals(" North Korea",line.toString());
+    Assert.assertEquals(" North Korea", line.toString());
     
     lineReader.readLine(line); 
-    Assert.assertEquals("",line.toString());
+    Assert.assertEquals("", line.toString());
     lineReader.readLine(line); 
-    Assert.assertEquals("Guantanamo",line.toString());
+    Assert.assertEquals("Guantanamo", line.toString());
     
     lineReader.readLine(line); 
-    Assert.assertEquals(("ecord"+"recor"+"core"),line.toString());
+    Assert.assertEquals(("ecord"+"recor"+"core"), line.toString());
+
+    // Test 3
+    // Input "aaaabccc" split by delimiter "aaab": the partial match "aaa"
+    // fails on the 4th 'a'; the reader must rewind to match "aaab" at index 1.
+    TestData = "aaaabccc";
+    Delimiter = "aaab";
+    lineReader = new LineReader(
+        new ByteArrayInputStream(TestData.getBytes()), Delimiter.getBytes());
+
+    lineReader.readLine(line);
+    Assert.assertEquals("a", line.toString());
+    lineReader.readLine(line);
+    Assert.assertEquals("ccc", line.toString());
   }
 }