12 năm trước cách đây · b4228dd662
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -625,6 +625,8 @@ Release 1.2.0 - 2013.04.16
 
				     HADOOP-9543. TestFsShellReturnCode may fail if the hardcoded user "admin"
			
 
				     is not a valid user in the local OS.  (szetszwo)
			
 
				 
			
 
				+    HADOOP-9544. Backport UTF8 encoding fixes.  (Chris Nauroth via szetszwo)
			
 
				+
			
 
				 Release 1.1.2 - 2013.01.30
			
 
				 
			
 
				   INCOMPATIBLE CHANGES
			
--- a/src/core/org/apache/hadoop/io/SequenceFile.java
+++ b/src/core/org/apache/hadoop/io/SequenceFile.java
@@ -1525,10 +1525,10 @@ public class SequenceFile {
 
				         UTF8 className = new UTF8();
			
 
				 
			
 
				         className.readFields(in);
			
 
				-        keyClassName = className.toString(); // key class name
			
 
				+        keyClassName = className.toStringChecked(); // key class name
			
 
				 
			
 
				         className.readFields(in);
			
 
				-        valClassName = className.toString(); // val class name
			
 
				+        valClassName = className.toStringChecked(); // val class name
			
 
				       } else {
			
 
				         keyClassName = Text.readString(in);
			
 
				         valClassName = Text.readString(in);
			
--- a/src/core/org/apache/hadoop/io/UTF8.java
+++ b/src/core/org/apache/hadoop/io/UTF8.java
@@ -21,17 +21,27 @@ package org.apache.hadoop.io;
 
				 import java.io.IOException;
			
 
				 import java.io.DataInput;
			
 
				 import java.io.DataOutput;
			
 
				+import java.io.UTFDataFormatException;
			
 
				 
			
 
				+import org.apache.hadoop.util.StringUtils;
			
 
				 
			
 
				 import org.apache.commons.logging.*;
			
 
				+import org.apache.hadoop.classification.InterfaceAudience;
			
 
				+import org.apache.hadoop.classification.InterfaceStability;
			
 
				 
			
 
				 /** A WritableComparable for strings that uses the UTF8 encoding.
			
 
				  * 
			
 
				  * <p>Also includes utilities for efficiently reading and writing UTF-8.
			
 
				  *
			
 
				+ * Note that this decodes UTF-8 but actually encodes CESU-8, a variant of
			
 
				+ * UTF-8: see http://en.wikipedia.org/wiki/CESU-8
			
 
				+ *
			
 
				  * @deprecated replaced by Text
			
 
				  */
			
 
				-public class UTF8 implements WritableComparable {
			
 
				+@Deprecated
			
 
				+@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
			
 
				+@InterfaceStability.Stable
			
 
				+public class UTF8 implements WritableComparable<UTF8> {
			
 
				   private static final Log LOG= LogFactory.getLog(UTF8.class);
			
 
				   private static final DataInputBuffer IBUF = new DataInputBuffer();
			
 
				 
			
@@ -105,6 +115,7 @@ public class UTF8 implements WritableComparable {
 
				     System.arraycopy(other.bytes, 0, bytes, 0, length);
			
 
				   }
			
 
				 
			
 
				+  @Override
			
 
				   public void readFields(DataInput in) throws IOException {
			
 
				     length = in.readUnsignedShort();
			
 
				     if (bytes == null || bytes.length < length)
			
@@ -118,21 +129,23 @@ public class UTF8 implements WritableComparable {
 
				     WritableUtils.skipFully(in, length);
			
 
				   }
			
 
				 
			
 
				+  @Override
			
 
				   public void write(DataOutput out) throws IOException {
			
 
				     out.writeShort(length);
			
 
				     out.write(bytes, 0, length);
			
 
				   }
			
 
				 
			
 
				   /** Compare two UTF8s. */
			
 
				-  public int compareTo(Object o) {
			
 
				-    UTF8 that = (UTF8)o;
			
 
				+  @Override
			
 
				+  public int compareTo(UTF8 o) {
			
 
				     return WritableComparator.compareBytes(bytes, 0, length,
			
 
				-                                           that.bytes, 0, that.length);
			
 
				+                                           o.bytes, 0, o.length);
			
 
				   }
			
 
				 
			
 
				   /** Convert to a String. */
			
 
				+  @Override
			
 
				   public String toString() {
			
 
				-    StringBuffer buffer = new StringBuffer(length);
			
 
				+    StringBuilder buffer = new StringBuilder(length);
			
 
				     try {
			
 
				       synchronized (IBUF) {
			
 
				         IBUF.reset(bytes, length);
			
@@ -143,8 +156,24 @@ public class UTF8 implements WritableComparable {
 
				     }
			
 
				     return buffer.toString();
			
 
				   }
			
 
				+  
			
 
				+  /**
			
 
				+   * Convert to a string, checking for valid UTF8.
			
 
				+   * @return the converted string
			
 
				+   * @throws UTFDataFormatException if the underlying bytes contain invalid
			
 
				+   * UTF8 data.
			
 
				+   */
			
 
				+  public String toStringChecked() throws IOException {
			
 
				+    StringBuilder buffer = new StringBuilder(length);
			
 
				+    synchronized (IBUF) {
			
 
				+      IBUF.reset(bytes, length);
			
 
				+      readChars(IBUF, buffer, length);
			
 
				+    }
			
 
				+    return buffer.toString();
			
 
				+  }
			
 
				 
			
 
				   /** Returns true iff <code>o</code> is a UTF8 with the same contents.  */
			
 
				+  @Override
			
 
				   public boolean equals(Object o) {
			
 
				     if (!(o instanceof UTF8))
			
 
				       return false;
			
@@ -156,6 +185,7 @@ public class UTF8 implements WritableComparable {
 
				                                              that.bytes, 0, that.length) == 0;
			
 
				   }
			
 
				 
			
 
				+  @Override
			
 
				   public int hashCode() {
			
 
				     return WritableComparator.hashBytes(bytes, length);
			
 
				   }
			
@@ -166,6 +196,7 @@ public class UTF8 implements WritableComparable {
 
				       super(UTF8.class);
			
 
				     }
			
 
				 
			
 
				+    @Override
			
 
				     public int compare(byte[] b1, int s1, int l1,
			
 
				                        byte[] b2, int s2, int l2) {
			
 
				       int n1 = readUnsignedShort(b1, s1);
			
@@ -198,19 +229,32 @@ public class UTF8 implements WritableComparable {
 
				     return result;
			
 
				   }
			
 
				 
			
 
				+  /**
			
 
				+   * Convert a UTF-8 encoded byte array back into a string.
			
 
				+   *
			
 
				+   * @throws IOException if the byte array is invalid UTF8
			
 
				+   */
			
 
				+  public static String fromBytes(byte[] bytes) throws IOException {
			
 
				+    DataInputBuffer dbuf = new DataInputBuffer();
			
 
				+    dbuf.reset(bytes, 0, bytes.length);
			
 
				+    StringBuilder buf = new StringBuilder(bytes.length);
			
 
				+    readChars(dbuf, buf, bytes.length);
			
 
				+    return buf.toString();
			
 
				+  }
			
 
				+
			
 
				   /** Read a UTF-8 encoded string.
			
 
				    *
			
 
				    * @see DataInput#readUTF()
			
 
				    */
			
 
				   public static String readString(DataInput in) throws IOException {
			
 
				     int bytes = in.readUnsignedShort();
			
 
				-    StringBuffer buffer = new StringBuffer(bytes);
			
 
				+    StringBuilder buffer = new StringBuilder(bytes);
			
 
				     readChars(in, buffer, bytes);
			
 
				     return buffer.toString();
			
 
				   }
			
 
				 
			
 
				-  private static void readChars(DataInput in, StringBuffer buffer, int nBytes)
			
 
				-    throws IOException {
			
 
				+  private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
			
 
				+    throws UTFDataFormatException, IOException {
			
 
				     DataOutputBuffer obuf = OBUF_FACTORY.get();
			
 
				     obuf.reset();
			
 
				     obuf.write(in, nBytes);
			
@@ -219,18 +263,60 @@ public class UTF8 implements WritableComparable {
 
				     while (i < nBytes) {
			
 
				       byte b = bytes[i++];
			
 
				       if ((b & 0x80) == 0) {
			
 
				+        // 0b0xxxxxxx: 1-byte sequence
			
 
				         buffer.append((char)(b & 0x7F));
			
 
				-      } else if ((b & 0xE0) != 0xE0) {
			
 
				+      } else if ((b & 0xE0) == 0xC0) {
			
 
				+        if (i >= nBytes) {
			
 
				+          throw new UTFDataFormatException("Truncated UTF8 at " +
			
 
				+              StringUtils.byteToHexString(bytes, i - 1, 1));
			
 
				+        }
			
 
				+        // 0b110xxxxx: 2-byte sequence
			
 
				         buffer.append((char)(((b & 0x1F) << 6)
			
 
				             | (bytes[i++] & 0x3F)));
			
 
				-      } else {
			
 
				+      } else if ((b & 0xF0) == 0xE0) {
			
 
				+        // 0b1110xxxx: 3-byte sequence
			
 
				+        if (i + 1 >= nBytes) {
			
 
				+          throw new UTFDataFormatException("Truncated UTF8 at " +
			
 
				+              StringUtils.byteToHexString(bytes, i - 1, 2));
			
 
				+        }
			
 
				         buffer.append((char)(((b & 0x0F) << 12)
			
 
				             | ((bytes[i++] & 0x3F) << 6)
			
 
				             |  (bytes[i++] & 0x3F)));
			
 
				+      } else if ((b & 0xF8) == 0xF0) {
			
 
				+        if (i + 2 >= nBytes) {
			
 
				+          throw new UTFDataFormatException("Truncated UTF8 at " +
			
 
				+              StringUtils.byteToHexString(bytes, i - 1, 3));
			
 
				+        }
			
 
				+        // 0b11110xxx: 4-byte sequence
			
 
				+        int codepoint =
			
 
				+            ((b & 0x07) << 18)
			
 
				+          | ((bytes[i++] & 0x3F) <<  12)
			
 
				+          | ((bytes[i++] & 0x3F) <<  6)
			
 
				+          | ((bytes[i++] & 0x3F));
			
 
				+        buffer.append(highSurrogate(codepoint))
			
 
				+              .append(lowSurrogate(codepoint));
			
 
				+      } else {
			
 
				+        // The UTF8 standard describes 5-byte and 6-byte sequences, but
			
 
				+        // these are no longer allowed as of 2003 (see RFC 3629)
			
 
				+
			
 
				+        // Only show the next 6 bytes max in the error message - in case the
			
 
				+        // buffer is large, this will prevent an exceedingly large message.
			
 
				+        int endForError = Math.min(i + 5, nBytes);
			
 
				+        throw new UTFDataFormatException("Invalid UTF8 at " +
			
 
				+            StringUtils.byteToHexString(bytes, i - 1, endForError));
			
 
				       }
			
 
				     }
			
 
				   }
			
 
				 
			
 
				+  private static char highSurrogate(int codePoint) {
			
 
				+    return (char) ((codePoint >>> 10)
			
 
				+        + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
			
 
				+  }
			
 
				+
			
 
				+  private static char lowSurrogate(int codePoint) {
			
 
				+    return (char) ((codePoint & 0x3ff) + Character.MIN_LOW_SURROGATE);
			
 
				+  }
			
 
				+
			
 
				   /** Write a UTF-8 encoded string.
			
 
				    *
			
 
				    * @see DataOutput#writeUTF(String)
			
@@ -257,7 +343,7 @@ public class UTF8 implements WritableComparable {
 
				     int utf8Length = 0;
			
 
				     for (int i = 0; i < stringLength; i++) {
			
 
				       int c = string.charAt(i);
			
 
				-      if ((c >= 0x0001) && (c <= 0x007F)) {
			
 
				+      if (c <= 0x007F) {
			
 
				         utf8Length++;
			
 
				       } else if (c > 0x07FF) {
			
 
				         utf8Length += 3;
			
@@ -274,7 +360,7 @@ public class UTF8 implements WritableComparable {
 
				     final int end = start + length;
			
 
				     for (int i = start; i < end; i++) {
			
 
				       int code = s.charAt(i);
			
 
				-      if (code >= 0x01 && code <= 0x7F) {
			
 
				+      if (code <= 0x7F) {
			
 
				         out.writeByte((byte)code);
			
 
				       } else if (code <= 0x07FF) {
			
 
				         out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F)));
			
--- a/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
+++ b/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
@@ -1822,7 +1822,7 @@ public class FSImage extends Storage {
 
				   static private final UTF8 U_STR = new UTF8();
			
 
				   public static String readString(DataInputStream in) throws IOException {
			
 
				     U_STR.readFields(in);
			
 
				-    return U_STR.toString();
			
 
				+    return U_STR.toStringChecked();
			
 
				   }
			
 
				 
			
 
				   static String readString_EmptyAsNull(DataInputStream in) throws IOException {
			
--- a/src/test/org/apache/hadoop/io/TestUTF8.java
+++ b/src/test/org/apache/hadoop/io/TestUTF8.java
@@ -19,16 +19,22 @@
 
				 package org.apache.hadoop.io;
			
 
				 
			
 
				 import junit.framework.TestCase;
			
 
				+import java.io.IOException;
			
 
				+import java.io.UTFDataFormatException;
			
 
				 import java.util.Random;
			
 
				 
			
 
				+import org.apache.hadoop.test.GenericTestUtils;
			
 
				+import org.apache.hadoop.util.StringUtils;
			
 
				+
			
 
				 /** Unit tests for UTF8. */
			
 
				+@SuppressWarnings("deprecation")
			
 
				 public class TestUTF8 extends TestCase {
			
 
				   public TestUTF8(String name) { super(name); }
			
 
				 
			
 
				   private static final Random RANDOM = new Random();
			
 
				 
			
 
				   public static String getTestString() throws Exception {
			
 
				-    StringBuffer buffer = new StringBuffer();
			
 
				+    StringBuilder buffer = new StringBuilder();
			
 
				     int length = RANDOM.nextInt(100);
			
 
				     for (int i = 0; i < length; i++) {
			
 
				       buffer.append((char)(RANDOM.nextInt(Character.MAX_VALUE)));
			
@@ -37,13 +43,13 @@ public class TestUTF8 extends TestCase {
 
				   }
			
 
				 
			
 
				   public void testWritable() throws Exception {
			
 
				-    for (int i = 0; i < 10; i++) {
			
 
				+    for (int i = 0; i < 10000; i++) {
			
 
				       TestWritable.testWritable(new UTF8(getTestString()));
			
 
				     }
			
 
				   }
			
 
				 
			
 
				   public void testGetBytes() throws Exception {
			
 
				-    for (int i = 0; i < 10; i++) {
			
 
				+    for (int i = 0; i < 10000; i++) {
			
 
				 
			
 
				       // generate a random string
			
 
				       String before = getTestString();
			
@@ -57,7 +63,7 @@ public class TestUTF8 extends TestCase {
 
				     DataOutputBuffer out = new DataOutputBuffer();
			
 
				     DataInputBuffer in = new DataInputBuffer();
			
 
				 
			
 
				-    for (int i = 0; i < 10; i++) {
			
 
				+    for (int i = 0; i < 10000; i++) {
			
 
				       // generate a random string
			
 
				       String before = getTestString();
			
 
				 
			
@@ -68,19 +74,96 @@ public class TestUTF8 extends TestCase {
 
				       // test that it reads correctly
			
 
				       in.reset(out.getData(), out.getLength());
			
 
				       String after = UTF8.readString(in);
			
 
				-      assertTrue(before.equals(after));
			
 
				+      assertEquals(before, after);
			
 
				 
			
 
				       // test that it reads correctly with DataInput
			
 
				       in.reset(out.getData(), out.getLength());
			
 
				       String after2 = in.readUTF();
			
 
				-      assertTrue(before.equals(after2));
			
 
				+      assertEquals(before, after2);
			
 
				 
			
 
				       // test that it is compatible with Java's other decoder
			
 
				       String after3 = new String(out.getData(), 2, out.getLength()-2, "UTF-8");
			
 
				-      assertTrue(before.equals(after3));
			
 
				+      assertEquals(before, after3);
			
 
				 
			
 
				     }
			
 
				 
			
 
				   }
			
 
				-	
			
 
				+
			
 
				+  public void testNullEncoding() throws Exception {
			
 
				+    String s = new String(new char[] { 0 });
			
 
				+
			
 
				+    DataOutputBuffer dob = new DataOutputBuffer();
			
 
				+    new UTF8(s).write(dob);
			
 
				+
			
 
				+    assertEquals(s, new String(dob.getData(), 2, dob.getLength()-2, "UTF-8"));
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Test encoding and decoding of UTF8 outside the basic multilingual plane.
			
 
				+   *
			
 
				+   * This is a regression test for HADOOP-9103.
			
 
				+   */
			
 
				+  public void testNonBasicMultilingualPlane() throws Exception {
			
 
				+    // Test using the "CAT FACE" character (U+1F431)
			
 
				+    // See http://www.fileformat.info/info/unicode/char/1f431/index.htm
			
 
				+    String catFace = "\uD83D\uDC31";
			
 
				+
			
 
				+    // This encodes to 4 bytes in UTF-8:
			
 
				+    byte[] encoded = catFace.getBytes("UTF-8");
			
 
				+    assertEquals(4, encoded.length);
			
 
				+    assertEquals("f09f90b1", StringUtils.byteToHexString(encoded));
			
 
				+
			
 
				+    // Decode back to String using our own decoder
			
 
				+    String roundTrip = UTF8.fromBytes(encoded);
			
 
				+    assertEquals(catFace, roundTrip);
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Test that decoding invalid UTF8 throws an exception with an appropriate error message.
			
 
				+   */
			
 
				+  public void testInvalidUTF8() throws Exception {
			
 
				+    byte[] invalid = new byte[] {
			
 
				+        0x01, 0x02, (byte)0xff, (byte)0xff, 0x01, 0x02, 0x03, 0x04, 0x05 };
			
 
				+    try {
			
 
				+      UTF8.fromBytes(invalid);
			
 
				+      fail("did not throw an exception");
			
 
				+    } catch (UTFDataFormatException utfde) {
			
 
				+      GenericTestUtils.assertExceptionContains(
			
 
				+          "Invalid UTF8 at ffff01020304", utfde);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Test for a 5-byte UTF8 sequence, which is now considered illegal.
			
 
				+   */
			
 
				+  public void test5ByteUtf8Sequence() throws Exception {
			
 
				+    byte[] invalid = new byte[] {
			
 
				+        0x01, 0x02, (byte)0xf8, (byte)0x88, (byte)0x80,
			
 
				+        (byte)0x80, (byte)0x80, 0x04, 0x05 };
			
 
				+    try {
			
 
				+      UTF8.fromBytes(invalid);
			
 
				+      fail("did not throw an exception");
			
 
				+    } catch (UTFDataFormatException utfde) {
			
 
				+      GenericTestUtils.assertExceptionContains(
			
 
				+          "Invalid UTF8 at f88880808004", utfde);
			
 
				+    }
			
 
				+  }
			
 
				+  
			
 
				+  /**
			
 
				+   * Test that decoding invalid UTF8 due to truncation yields the correct
			
 
				+   * exception type.
			
 
				+   */
			
 
				+  public void testInvalidUTF8Truncated() throws Exception {
			
 
				+    // Truncated CAT FACE character -- this is a 4-byte sequence, but we
			
 
				+    // only have the first three bytes.
			
 
				+    byte[] truncated = new byte[] {
			
 
				+        (byte)0xF0, (byte)0x9F, (byte)0x90 };
			
 
				+    try {
			
 
				+      UTF8.fromBytes(truncated);
			
 
				+      fail("did not throw an exception");
			
 
				+    } catch (UTFDataFormatException utfde) {
			
 
				+      GenericTestUtils.assertExceptionContains(
			
 
				+          "Truncated UTF8 at f09f90", utfde);
			
 
				+    }
			
 
				+  }
			
 
				 }
			
--- a/src/test/org/apache/hadoop/test/GenericTestUtils.java
+++ b/src/test/org/apache/hadoop/test/GenericTestUtils.java
@@ -0,0 +1,34 @@
 
				+/**
			
 
				+ * Licensed to the Apache Software Foundation (ASF) under one
			
 
				+ * or more contributor license agreements.  See the NOTICE file
			
 
				+ * distributed with this work for additional information
			
 
				+ * regarding copyright ownership.  The ASF licenses this file
			
 
				+ * to you under the Apache License, Version 2.0 (the
			
 
				+ * "License"); you may not use this file except in compliance
			
 
				+ * with the License.  You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+package org.apache.hadoop.test;
			
 
				+
			
 
				+import org.apache.hadoop.util.StringUtils;
			
 
				+import org.junit.Assert;
			
 
				+
			
 
				+/**
			
 
				+ * Provides some very generic helpers which might be used across the tests.
			
 
				+ */
			
 
				+public abstract class GenericTestUtils {
			
 
				+
			
 
				+  public static void assertExceptionContains(String string, Throwable t) {
			
 
				+    String msg = t.getMessage();
			
 
				+    Assert.assertTrue(
			
 
				+        "Expected to find '" + string + "' but got unexpected exception:"
			
 
				+        + StringUtils.stringifyException(t), msg.contains(string));
			
 
				+  }  
			
 
				+}