Explorar o código

HADOOP-550. Disable automatic UTF-8 validation in Text. Contributed by Hairong & Mahadev.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@452941 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting hai 19 anos
pai
achega
3be71187ef

+ 4 - 0
CHANGES.txt

@@ -128,6 +128,10 @@ Trunk (unreleased changes)
     tasktracker correctly report the task as failed to the jobtracker,
     tasktracker correctly report the task as failed to the jobtracker,
     so that it will be rescheduled.  (omalley via cutting)
     so that it will be rescheduled.  (omalley via cutting)
 
 
+31. HADOOP-550.  Disable automatic UTF-8 validation in Text.  This
+    permits, e.g., TextInputFormat to again operate on non-UTF-8 data.
+    (Hairong and Mahadev via cutting)
+
 
 
 Release 0.6.2 - 2006-09-18
 Release 0.6.2 - 2006-09-18
 
 

+ 19 - 34
src/java/org/apache/hadoop/io/Text.java

@@ -64,10 +64,8 @@ public class Text implements WritableComparable {
   }
   }
 
 
   /** Construct from a string. 
   /** Construct from a string. 
-   * @exception CharacterCodingExcetpion if the string contains 
-   *            invalid codepoints or unpaired surrogates
    */
    */
-  public Text(String string) throws CharacterCodingException {
+  public Text(String string) {
     set(string);
     set(string);
   }
   }
 
 
@@ -77,9 +75,8 @@ public class Text implements WritableComparable {
   }
   }
 
 
   /** Construct from a byte array.
   /** Construct from a byte array.
-   * @exception CharacterCodingExcetpion if the array has invalid UTF8 bytes 
    */
    */
-  public Text(byte[] utf8) throws CharacterCodingException {
+  public Text(byte[] utf8)  {
     set(utf8);
     set(utf8);
   }
   }
   
   
@@ -160,29 +157,26 @@ public class Text implements WritableComparable {
     }
     }
   }  
   }  
   /** Set to contain the contents of a string. 
   /** Set to contain the contents of a string. 
-   * @exception CharacterCodingException if the string contains 
-   *       invalid codepoints or unpaired surrogate
    */
    */
-  public void set(String string) throws CharacterCodingException {
-    ByteBuffer bb = encode(string);
-    bytes = bb.array();
-    length = bb.limit();
+  public void set(String string) {
+    try {
+      ByteBuffer bb = encode(string, true);
+      bytes = bb.array();
+      length = bb.limit();
+    }catch(CharacterCodingException e) {
+      throw new RuntimeException("Should not have happened " + e.toString()); 
+    }
   }
   }
 
 
   /** Set to a utf8 byte array
   /** Set to a utf8 byte array
-   * @exception CharacterCodingException if the array contains invalid UTF8 code  
    */
    */
-  public void set(byte[] utf8) throws CharacterCodingException {
+  public void set(byte[] utf8) {
     set(utf8, 0, utf8.length);
     set(utf8, 0, utf8.length);
   }
   }
   
   
   /** copy a text. */
   /** copy a text. */
   public void set(Text other) {
   public void set(Text other) {
-    try {
-      set(other.bytes, 0, other.length);
-    } catch (CharacterCodingException e) {
-      throw new RuntimeException("bad Text UTF8 encoding", e);
-    }
+    set(other.bytes, 0, other.length);
   }
   }
 
 
   /**
   /**
@@ -191,9 +185,7 @@ public class Text implements WritableComparable {
    * @param start the first position of the new string
    * @param start the first position of the new string
    * @param len the number of bytes of the new string
    * @param len the number of bytes of the new string
    */
    */
-  public void set(byte[] utf8, int start, int len 
-                  ) throws CharacterCodingException{
-    validateUTF8(utf8, start, len);
+  public void set(byte[] utf8, int start, int len) {
     setCapacity(len);
     setCapacity(len);
     System.arraycopy(utf8, start, bytes, 0, len);
     System.arraycopy(utf8, start, bytes, 0, len);
     this.length = len;
     this.length = len;
@@ -221,22 +213,16 @@ public class Text implements WritableComparable {
     try {
     try {
       return decode(bytes, 0, length);
       return decode(bytes, 0, length);
     } catch (CharacterCodingException e) { 
     } catch (CharacterCodingException e) { 
-      //bytes is supposed to contain valid utf8, therefore, 
-      // this should never happen
       return null;
       return null;
     }
     }
   }
   }
   
   
   /** deserialize 
   /** deserialize 
-   * check if the received bytes are valid utf8 code. 
-   * if not throws MalformedInputException
-   * @see Writable#readFields(DataInput)
    */
    */
   public void readFields(DataInput in) throws IOException {
   public void readFields(DataInput in) throws IOException {
     length = WritableUtils.readVInt(in);
     length = WritableUtils.readVInt(in);
     setCapacity(length);
     setCapacity(length);
     in.readFully(bytes, 0, length);
     in.readFully(bytes, 0, length);
-    validateUTF8(bytes);
   }
   }
 
 
   /** Skips over one Text in the input. */
   /** Skips over one Text in the input. */
@@ -251,7 +237,7 @@ public class Text implements WritableComparable {
    * @see Writable#write(DataOutput)
    * @see Writable#write(DataOutput)
    */
    */
   public void write(DataOutput out) throws IOException {
   public void write(DataOutput out) throws IOException {
-    WritableUtils.writeVInt(out, length); // out.writeInt(length);
+    WritableUtils.writeVInt(out, length);
     out.write(bytes, 0, length);
     out.write(bytes, 0, length);
   }
   }
 
 
@@ -313,15 +299,15 @@ public class Text implements WritableComparable {
   /**
   /**
    * Converts the provided byte array to a String using the
    * Converts the provided byte array to a String using the
    * UTF-8 encoding. If the input is malformed,
    * UTF-8 encoding. If the input is malformed,
-   * throws a MalformedInputException.
+   * replace by a default value.
    */
    */
   public static String decode(byte[] utf8) throws CharacterCodingException {
   public static String decode(byte[] utf8) throws CharacterCodingException {
-    return decode(ByteBuffer.wrap(utf8), false);
+    return decode(ByteBuffer.wrap(utf8), true);
   }
   }
   
   
   public static String decode(byte[] utf8, int start, int length) 
   public static String decode(byte[] utf8, int start, int length) 
       throws CharacterCodingException {
       throws CharacterCodingException {
-      return decode(ByteBuffer.wrap(utf8, start, length), false);
+      return decode(ByteBuffer.wrap(utf8, start, length), true);
   }
   }
   
   
   /**
   /**
@@ -358,14 +344,14 @@ public class Text implements WritableComparable {
   /**
   /**
    * Converts the provided String to bytes using the
    * Converts the provided String to bytes using the
    * UTF-8 encoding. If the input is malformed,
    * UTF-8 encoding. If the input is malformed,
-   * throws a MalformedInputException.
+   * invalid chars are replaced by a default value.
    * @return ByteBuffer: bytes stores at ByteBuffer.array() 
    * @return ByteBuffer: bytes stores at ByteBuffer.array() 
    *                     and length is ByteBuffer.limit()
    *                     and length is ByteBuffer.limit()
    */
    */
 
 
   public static ByteBuffer encode(String string)
   public static ByteBuffer encode(String string)
     throws CharacterCodingException {
     throws CharacterCodingException {
-    return encode(string, false);
+    return encode(string, true);
   }
   }
 
 
   /**
   /**
@@ -399,7 +385,6 @@ public class Text implements WritableComparable {
     int length = WritableUtils.readVInt(in);
     int length = WritableUtils.readVInt(in);
     byte [] bytes = new byte[length];
     byte [] bytes = new byte[length];
     in.readFully(bytes, 0, length);
     in.readFully(bytes, 0, length);
-    validateUTF8(bytes);
     return decode(bytes);
     return decode(bytes);
   }
   }
 
 

+ 2 - 6
src/java/org/apache/hadoop/record/Utils.java

@@ -255,12 +255,8 @@ public class Utils {
             sb.append(ch);
             sb.append(ch);
           }
           }
         }
         }
-        try {
-          return new Text(sb.toString());
-        } catch (CharacterCodingException ex) {
-          ex.printStackTrace();
-          return new Text();
-        }
+        
+        return new Text(sb.toString());
     }
     }
     
     
     /**
     /**

+ 54 - 0
src/test/org/apache/hadoop/io/TestTextNonUTF8.java

@@ -0,0 +1,54 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import junit.framework.TestCase;
+
+import java.nio.charset.MalformedInputException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.*;
+import java.util.Arrays;
+
+/** Checks that Text accepts a byte array that is not valid UTF-8 and stores it unchanged. */
+public class TestTextNonUTF8 extends TestCase {
+  private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.TestTextNonUTF8");
+
+  /** Builds a Text from malformed bytes, then verifies the bytes it holds equal the input. */
+  public void testNonUTF8() throws Exception{
+   // 0xFF (-0x01 as a signed byte) can never appear in well-formed UTF-8
+   byte b[] = {-0x01, -0x01, -0x01, -0x01, -0x01, -0x01, -0x01};
+   boolean nonUTF8 = false;
+   Text t = new Text(b);
+   try{
+     Text.validateUTF8(b);
+   }catch(MalformedInputException me){
+     nonUTF8 = true;  // the validator rejecting the input is what marks it as malformed
+   }
+   // the fixture must really be non-UTF-8, otherwise the check below proves nothing
+   assertTrue(nonUTF8);
+   byte ret[] = t.getBytes();
+   // the bytes held by the Text must be the same as the ones it was created from
+   assertTrue(Arrays.equals(b, ret));
+  }
+
+  /** Runs the test directly, without a JUnit runner. */
+  public static void main(String[] args) throws Exception {
+    TestTextNonUTF8 test = new TestTextNonUTF8();
+    test.testNonUTF8();
+  }
+}