소스 검색

HADOOP-473. Fix TextInputFormat to correctly handle more EOL formats. Contributed by Dennis Kubes & James White.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@436916 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 18 년 전
부모
커밋
4585c5af5b
2개의 파일 변경, 26개의 추가 및 2개의 삭제
  1. 5 0
      CHANGES.txt
  2. 21 2
      src/java/org/apache/hadoop/mapred/TextInputFormat.java

+ 5 - 0
CHANGES.txt

@@ -69,6 +69,11 @@ Trunk (unreleased changes)
     scheduling priority for daemons.  (Vetle Roeim via cutting)
 
 
+17. HADOOP-473.  Fix TextInputFormat to correctly handle more EOL
+    formats.  Things now work correctly with CR, LF or CRLF.
+    (Dennis Kubes & James White via cutting)
+
+
 Release 0.5.0 - 2006-08-04
 
  1. HADOOP-352.  Fix shell scripts to use /bin/sh instead of

+ 21 - 2
src/java/org/apache/hadoop/mapred/TextInputFormat.java

@@ -44,7 +44,16 @@ public class TextInputFormat extends InputFormatBase {
       in.seek(start-1);
       while (in.getPos() < end) {    // scan to the next newline in the file
         char c = (char)in.read();
-        if (c == '\r' || c == '\n') {
+        if (c == '\n')
+          break;
+          
+        if (c == '\r') {       
+          long curPos = in.getPos();
+          char nextC = (char)in.read();
+          if (nextC != '\n') {
+            in.seek(curPos);
+          }
+
           break;
         }
       }
@@ -90,8 +99,18 @@ public class TextInputFormat extends InputFormatBase {
         break;
 
       char c = (char)b;              // bug: this assumes eight-bit characters.
-      if (c == '\r' || c == '\n')
+      if (c == '\n')
         break;
+        
+      if (c == '\r') {       
+        long curPos = in.getPos();
+        char nextC = (char)in.read();
+        if (nextC != '\n') {
+          in.seek(curPos);
+        }
+
+        break;
+      }
 
       buffer.append(c);
     }