пре 19 година · dddd91ba25
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -85,6 +85,11 @@ Trunk (unreleased changes)
 
				 24. HADOOP-385.  Fix some bugs in record io code generation.
			
 
				     (Milind Bhandarkar via cutting)
			
 
				 
			
 
				+25. HADOOP-302.  Add new Text class to replace UTF8, removing
			
 
				+    limitations of that class.  Also refactor utility methods for
			
 
				+    writing zero-compressed integers (VInts and VLongs).
			
 
				+    (Hairong Kuang via cutting)
			
 
				+
			
 
				 
			
 
				 Release 0.4.0 - 2006-06-28
			
 
				 
			
--- a/src/java/org/apache/hadoop/io/Text.java
+++ b/src/java/org/apache/hadoop/io/Text.java
@@ -0,0 +1,568 @@
 
				+/**
			
 
				+ * Copyright 2005 The Apache Software Foundation
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+package org.apache.hadoop.io;
			
 
				+
			
 
				+import java.io.IOException;
			
 
				+import java.io.DataInput;
			
 
				+import java.io.DataOutput;
			
 
				+import java.nio.ByteBuffer;
			
 
				+import java.nio.CharBuffer;
			
 
				+import java.nio.charset.CharacterCodingException;
			
 
				+import java.nio.charset.Charset;
			
 
				+import java.nio.charset.CharsetDecoder;
			
 
				+import java.nio.charset.CharsetEncoder;
			
 
				+import java.nio.charset.CodingErrorAction;
			
 
				+import java.nio.charset.MalformedInputException;
			
 
				+import java.text.CharacterIterator;
			
 
				+import java.text.StringCharacterIterator;
			
 
				+
			
 
				+import org.apache.commons.logging.Log;
			
 
				+import org.apache.commons.logging.LogFactory;
			
 
				+
			
 
				+/** This class stores text using standard UTF8 encoding.  It provides methods
			
 
				+ * to serialize, deserialize, and compare texts at byte level.  The type of
			
 
				+ * length is integer and is serialized using zero-compressed format.  <p>In
			
 
				+ * addition, it provides methods for string traversal without converting the
			
 
				+ * byte array to a string.  <p>Also includes utilities for
			
 
				+ * serializing/deserializing a string, coding/decoding a string, checking if a
			
 
				+ * byte array contains valid UTF8 code, calculating the length of an encoded
			
 
				+ * string.
			
 
				+ */
			
 
				+public class Text implements WritableComparable {
			
 
				+  private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.Text");
			
 
				+  
			
 
				+  private static final CharsetDecoder DECODER = 
			
 
				+    Charset.forName("UTF-8").newDecoder().
			
 
				+    onMalformedInput(CodingErrorAction.REPORT).
			
 
				+    onUnmappableCharacter(CodingErrorAction.REPORT);
			
 
				+  private static final CharsetEncoder ENCODER = 
			
 
				+    Charset.forName("UTF-8").newEncoder().
			
 
				+    onMalformedInput(CodingErrorAction.REPORT).
			
 
				+    onUnmappableCharacter(CodingErrorAction.REPORT);
			
 
				+
			
 
				+  private static final byte [] EMPTY_BYTES = new byte[0];
			
 
				+  
			
 
				+  private byte[] bytes;
			
 
				+  private int length;
			
 
				+
			
 
				+  public Text() {
			
 
				+    bytes = EMPTY_BYTES;
			
 
				+  }
			
 
				+
			
 
				+  /** Construct from a string. 
			
 
				+   * @exception CharacterCodingExcetpion if the string contains 
			
 
				+   *            invalid codepoints or unpaired surrogates
			
 
				+   */
			
 
				+  public Text(String string) throws CharacterCodingException {
			
 
				+    set(string);
			
 
				+  }
			
 
				+
			
 
				+  /** Construct from another text. */
			
 
				+  public Text(Text utf8) {
			
 
				+    set(utf8);
			
 
				+  }
			
 
				+
			
 
				+  /** Construct from a byte array.
			
 
				+   * @exception CharacterCodingExcetpion if the array has invalid UTF8 bytes 
			
 
				+   */
			
 
				+  public Text(byte[] utf8) throws CharacterCodingException {
			
 
				+    set(utf8);
			
 
				+  }
			
 
				+  
			
 
				+  /** Retuns the raw bytes. */
			
 
				+  public byte[] getBytes() {
			
 
				+    return bytes;
			
 
				+  }
			
 
				+
			
 
				+  /** Returns the number of bytes in the byte array */ 
			
 
				+  public int getLength() {
			
 
				+    return length;
			
 
				+  }
			
 
				+  
			
 
				+  /**
			
 
				+   * Returns the Unicode Scalar Value (32-bit integer value)
			
 
				+   * for the character at <code>position</code>. Note that this
			
 
				+   * method avoids using the converter or doing String instatiation
			
 
				+   * @returns the Unicode scalar value at position or -1
			
 
				+   *          if the position is invalid or points to a
			
 
				+   *          trailing byte
			
 
				+   */
			
 
				+  public int charAt(int position) {
			
 
				+    if (position > this.length) return -1; // too long
			
 
				+    if (position < 0) return -1; // duh.
			
 
				+      
			
 
				+    ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
			
 
				+    return bytesToCodePoint(bb.slice());
			
 
				+  }
			
 
				+  
			
 
				+  public int find(String what) {
			
 
				+    return find(what, 0);
			
 
				+  }
			
 
				+  
			
 
				+  /**
			
 
				+   * Finds any occurence of <code>what</code> in the backing
			
 
				+   * buffer, starting as position <code>start</code>. The starting
			
 
				+   * position is measured in bytes and the return value is in
			
 
				+   * terms of byte position in the buffer. The backing buffer is
			
 
				+   * not converted to a string for this operation.
			
 
				+   * @return byte position of the first occurence of the search
			
 
				+   *         string in the UTF-8 buffer or -1 if not found
			
 
				+   */
			
 
				+  public int find(String what, int start) {
			
 
				+    try {
			
 
				+      ByteBuffer src = ByteBuffer.wrap(this.bytes);
			
 
				+      ByteBuffer tgt = encode(what);
			
 
				+      byte b = tgt.get();
			
 
				+      src.position(start);
			
 
				+          
			
 
				+      while (src.hasRemaining()) {
			
 
				+        if (b == src.get()) { // matching first byte
			
 
				+          src.mark(); // save position in loop
			
 
				+          tgt.mark(); // save position in target
			
 
				+          boolean found = true;
			
 
				+          int pos = src.position()-1;
			
 
				+          while (tgt.hasRemaining()) {
			
 
				+            if (!src.hasRemaining()) { // src expired first
			
 
				+              tgt.reset();
			
 
				+              src.reset();
			
 
				+              found = false;
			
 
				+              break;
			
 
				+            }
			
 
				+            if (!(tgt.get() == src.get())) {
			
 
				+              tgt.reset();
			
 
				+              src.reset();
			
 
				+              found = false;
			
 
				+              break; // no match
			
 
				+            }
			
 
				+          }
			
 
				+          if (found) return pos;
			
 
				+        }
			
 
				+      }
			
 
				+      return -1; // not found
			
 
				+    } catch (CharacterCodingException e) {
			
 
				+      // can't get here
			
 
				+      e.printStackTrace();
			
 
				+      return -1;
			
 
				+    }
			
 
				+  }  
			
 
				+  /** Set to contain the contents of a string. 
			
 
				+   * @exception CharacterCodingException if the string contains 
			
 
				+   *       invalid codepoints or unpaired surrogate
			
 
				+   */
			
 
				+  public void set(String string) throws CharacterCodingException {
			
 
				+    ByteBuffer bb = encode(string);
			
 
				+    bytes = bb.array();
			
 
				+    length = bb.limit();
			
 
				+  }
			
 
				+
			
 
				+  /** Set to a utf8 byte array
			
 
				+   * @exception CharacterCodingException if the array contains invalid UTF8 code  
			
 
				+   */
			
 
				+  public void set(byte[] utf8) throws CharacterCodingException {
			
 
				+    validateUTF8(utf8);
			
 
				+    set(utf8, utf8.length);
			
 
				+  }
			
 
				+  
			
 
				+  /** copy a text. */
			
 
				+  public void set(Text other) {
			
 
				+    set(other.bytes, other.length);
			
 
				+  }
			
 
				+
			
 
				+  private void set(byte[] utf8, int len ) {
			
 
				+    setCapacity(len);
			
 
				+    System.arraycopy(utf8, 0, bytes, 0, len);
			
 
				+    this.length = len;
			
 
				+  }
			
 
				+
			
 
				+  /*
			
 
				+   * Sets the capacity of this Text object to <em>at least</em>
			
 
				+   * <code>len</code> bytes. If the current buffer is longer,
			
 
				+   * then the capacity and existing content of the buffer are
			
 
				+   * unchanged. If <code>len</code> is larger
			
 
				+   * than the current capacity, the Text object's capacity is
			
 
				+   * increased to match. The existing contents of the buffer
			
 
				+   * (if any) are deleted.
			
 
				+   */
			
 
				+  private void setCapacity( int len ) {
			
 
				+    if (bytes == null || bytes.length < length)
			
 
				+      bytes = new byte[length];      
			
 
				+  }
			
 
				+   
			
 
				+  /** 
			
 
				+   * Convert text back to string
			
 
				+   * @see java.lang.Object#toString()
			
 
				+   */
			
 
				+  public String toString() {
			
 
				+    try {
			
 
				+      return decode(bytes);
			
 
				+    } catch (CharacterCodingException e) { 
			
 
				+      //bytes is supposed to contain valid utf8, therefore, 
			
 
				+      // this should never happen
			
 
				+      return null;
			
 
				+    }
			
 
				+  }
			
 
				+  
			
 
				+  /** deserialize 
			
 
				+   * check if the received bytes are valid utf8 code. 
			
 
				+   * if not throws MalformedInputException
			
 
				+   * @see Writable#readFields(DataInput)
			
 
				+   */
			
 
				+  public void readFields(DataInput in) throws IOException {
			
 
				+    length = WritableUtils.readVInt(in);
			
 
				+    setCapacity(length);
			
 
				+    in.readFully(bytes, 0, length);
			
 
				+    validateUTF8(bytes);
			
 
				+  }
			
 
				+
			
 
				+  /** Skips over one Text in the input. */
			
 
				+  public static void skip(DataInput in) throws IOException {
			
 
				+    int length = WritableUtils.readVInt(in);
			
 
				+    in.skipBytes(length);
			
 
				+  }
			
 
				+
			
 
				+  /** serialize
			
 
				+   * write this object to out
			
 
				+   * length uses zero-compressed encoding
			
 
				+   * @see Writable#write(DataOutput)
			
 
				+   */
			
 
				+  public void write(DataOutput out) throws IOException {
			
 
				+    WritableUtils.writeVInt(out, length); // out.writeInt(length);
			
 
				+    out.write(bytes, 0, length);
			
 
				+  }
			
 
				+
			
 
				+  /** Compare two Texts bytewise using standard UTF8 ordering. */
			
 
				+  public int compareTo(Object o) {
			
 
				+    Text that = (Text)o;
			
 
				+    if(this == that)
			
 
				+      return 0;
			
 
				+    else
			
 
				+      return WritableComparator.compareBytes(bytes, 0, length,
			
 
				+                                             that.bytes, 0, that.length);
			
 
				+  }
			
 
				+
			
 
				+  /** Returns true iff <code>o</code> is a Text with the same contents.  */
			
 
				+  public boolean equals(Object o) {
			
 
				+    if (!(o instanceof Text))
			
 
				+      return false;
			
 
				+    Text that = (Text)o;
			
 
				+    if (this == that)
			
 
				+      return true;
			
 
				+    else if (this.length != that.length)
			
 
				+      return false;
			
 
				+    else
			
 
				+      return WritableComparator.compareBytes(bytes, 0, length,
			
 
				+                                             that.bytes, 0, that.length) == 0;
			
 
				+  }
			
 
				+
			
 
				+  /** hash function */
			
 
				+  public int hashCode() {
			
 
				+    return WritableComparator.hashBytes(bytes, length);
			
 
				+  }
			
 
				+
			
 
				+  /** A WritableComparator optimized for Text keys. */
			
 
				+  public static class Comparator extends WritableComparator {
			
 
				+    public Comparator() {
			
 
				+      super(Text.class);
			
 
				+    }
			
 
				+
			
 
				+    public int compare(byte[] b1, int s1, int l1,
			
 
				+                       byte[] b2, int s2, int l2) {
			
 
				+      try {
			
 
				+        int n1 = readVInt(b1, s1);
			
 
				+        int n2 = readVInt(b2, s2);
			
 
				+        return compareBytes(b1, s1+WritableUtils.getVIntSize(n1), n1, 
			
 
				+                            b2, s2+WritableUtils.getVIntSize(n2), n2);
			
 
				+      }catch(IOException e) {
			
 
				+        LOG.warn(e);
			
 
				+        throw new RuntimeException(e);
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  static {
			
 
				+    // register this comparator
			
 
				+    WritableComparator.define(Text.class, new Comparator());
			
 
				+  }
			
 
				+
			
 
				+  /// STATIC UTILITIES FROM HERE DOWN
			
 
				+  /**
			
 
				+   * Converts the provided byte array to a String using the
			
 
				+   * UTF-8 encoding. If the input is malformed,
			
 
				+   * throws a MalformedInputException.
			
 
				+   */
			
 
				+  public static String decode(byte[] utf8) throws CharacterCodingException {
			
 
				+    return decode(ByteBuffer.wrap(utf8), false);
			
 
				+  }
			
 
				+  
			
 
				+  /**
			
 
				+   * Converts the provided byte array to a String using the
			
 
				+   * UTF-8 encoding. If <code>replace</code> is true, then
			
 
				+   * malformed input is replaced with the
			
 
				+   * substitution character, which is U+FFFD. Otherwise the
			
 
				+   * method throws a MalformedInputException.
			
 
				+   */
			
 
				+  public static String decode(byte[] utf8, boolean replace) 
			
 
				+    throws CharacterCodingException {
			
 
				+    return decode(ByteBuffer.wrap(utf8), replace);
			
 
				+  }
			
 
				+  
			
 
				+  private static String decode(ByteBuffer utf8, boolean replace) 
			
 
				+    throws CharacterCodingException {
			
 
				+    synchronized(DECODER) {
			
 
				+      if (replace) {
			
 
				+        DECODER.onMalformedInput(
			
 
				+                                 java.nio.charset.CodingErrorAction.REPLACE);
			
 
				+        DECODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
			
 
				+      }
			
 
				+      String str = DECODER.decode(utf8).toString();
			
 
				+      // set decoder back to its default value: REPORT
			
 
				+      if (replace) {
			
 
				+        DECODER.onMalformedInput(CodingErrorAction.REPORT);
			
 
				+        DECODER.onUnmappableCharacter(CodingErrorAction.REPORT);
			
 
				+      }
			
 
				+      return str;
			
 
				+    }
			
 
				+
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Converts the provided String to bytes using the
			
 
				+   * UTF-8 encoding. If the input is malformed,
			
 
				+   * throws a MalformedInputException.
			
 
				+   * @return ByteBuffer: bytes stores at ByteBuffer.array() 
			
 
				+   *                     and length is ByteBuffer.limit()
			
 
				+   */
			
 
				+
			
 
				+  public static ByteBuffer encode(String string)
			
 
				+    throws CharacterCodingException {
			
 
				+    return encode(string, false);
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Converts the provided String to bytes using the
			
 
				+   * UTF-8 encoding. If <code>replace</code> is true, then
			
 
				+   * malformed input is replaced with the
			
 
				+   * substitution character, which is U+FFFD. Otherwise the
			
 
				+   * method throws a MalformedInputException.
			
 
				+   * @return ByteBuffer: bytes stores at ByteBuffer.array() 
			
 
				+   *                     and length is ByteBuffer.limit()
			
 
				+   */
			
 
				+  public static ByteBuffer encode(String string, boolean replace)
			
 
				+    throws CharacterCodingException {
			
 
				+    synchronized(ENCODER) {
			
 
				+      if (replace) {
			
 
				+        ENCODER.onMalformedInput(CodingErrorAction.REPLACE);
			
 
				+        ENCODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
			
 
				+      }
			
 
				+      ByteBuffer bytes=ENCODER.encode(CharBuffer.wrap(string.toCharArray()));
			
 
				+      if (replace) {
			
 
				+        ENCODER.onMalformedInput(CodingErrorAction.REPORT);
			
 
				+        ENCODER.onUnmappableCharacter(CodingErrorAction.REPORT);
			
 
				+      }
			
 
				+      return bytes;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /** Read a UTF8 encoded string from in
			
 
				+   */
			
 
				+  public static String readString(DataInput in) throws IOException {
			
 
				+    int length = WritableUtils.readVInt(in);
			
 
				+    byte [] bytes = new byte[length];
			
 
				+    in.readFully(bytes, 0, length);
			
 
				+    validateUTF8(bytes);
			
 
				+    return decode(bytes);
			
 
				+  }
			
 
				+
			
 
				+  /** Write a UTF8 encoded string to out
			
 
				+   */
			
 
				+  public static int writeString(DataOutput out, String s) throws IOException {
			
 
				+    ByteBuffer bytes = encode(s);
			
 
				+    int length = bytes.limit();
			
 
				+    WritableUtils.writeVInt(out, length);
			
 
				+    out.write(bytes.array(), 0, length);
			
 
				+    return length;
			
 
				+  }
			
 
				+
			
 
				+  ////// states for validateUTF8
			
 
				+  
			
 
				+  private static final int LEAD_BYTE = 0;
			
 
				+
			
 
				+  private static final int TRAIL_BYTE_1 = 1;
			
 
				+
			
 
				+  private static final int TRAIL_BYTE = 2;
			
 
				+
			
 
				+  /** 
			
 
				+   * Check if a byte array contains valid utf-8
			
 
				+   * @param utf8: byte array
			
 
				+   * @exception MalformedInputException if the byte array contains invalid utf-8
			
 
				+   */
			
 
				+  public static void validateUTF8(byte[] utf8) 
			
 
				+    throws MalformedInputException {
			
 
				+    int count = 0;
			
 
				+    int leadByte = 0;
			
 
				+    int length = 0;
			
 
				+    int state = LEAD_BYTE;
			
 
				+    while (count < utf8.length) {
			
 
				+      int aByte = ((int) utf8[count] & 0xFF);
			
 
				+
			
 
				+      switch (state) {
			
 
				+      case LEAD_BYTE:
			
 
				+        leadByte = aByte;
			
 
				+        length = bytesFromUTF8[aByte];
			
 
				+
			
 
				+        switch (length) {
			
 
				+        case 0: // check for ASCII
			
 
				+          if (leadByte > 0x7E)
			
 
				+            throw new MalformedInputException(count);
			
 
				+          state = TRAIL_BYTE;
			
 
				+          break;
			
 
				+        case 1:
			
 
				+          if (leadByte < 0xC2 || leadByte > 0xDF)
			
 
				+            throw new MalformedInputException(count);
			
 
				+          state = TRAIL_BYTE_1;
			
 
				+          break;
			
 
				+        case 2:
			
 
				+          if (leadByte < 0xE0 || leadByte > 0xEF)
			
 
				+            throw new MalformedInputException(count);
			
 
				+          state = TRAIL_BYTE_1;
			
 
				+          break;
			
 
				+        case 3:
			
 
				+          if (leadByte < 0xF0 || leadByte > 0xF4)
			
 
				+            throw new MalformedInputException(count);
			
 
				+          state = TRAIL_BYTE_1;
			
 
				+          break;
			
 
				+        default:
			
 
				+          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
			
 
				+          // or if < 0 we got a trail byte in the lead byte position
			
 
				+          throw new MalformedInputException(count);
			
 
				+        } // switch (length)
			
 
				+        break;
			
 
				+
			
 
				+      case TRAIL_BYTE_1:
			
 
				+        if (leadByte == 0xF0 && aByte < 0x90)
			
 
				+          throw new MalformedInputException(count);
			
 
				+        if (leadByte == 0xF4 && aByte > 0x8F)
			
 
				+          throw new MalformedInputException(count);
			
 
				+        if (leadByte == 0xE0 && aByte < 0xA0)
			
 
				+          throw new MalformedInputException(count);
			
 
				+        if (leadByte == 0xED && aByte > 0x9F)
			
 
				+          throw new MalformedInputException(count);
			
 
				+        // falls through to regular trail-byte test!!
			
 
				+      case TRAIL_BYTE:
			
 
				+        if (aByte < 0x80 || aByte > 0xBF)
			
 
				+          throw new MalformedInputException(count);
			
 
				+        if (--length == 0) {
			
 
				+          state = LEAD_BYTE;
			
 
				+        } else {
			
 
				+          state = TRAIL_BYTE;
			
 
				+        }
			
 
				+        break;
			
 
				+      } // switch (state)
			
 
				+      count++;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Magic numbers for UTF-8. These are the number of bytes
			
 
				+   * that <em>follow</em> a given lead byte. Trailing bytes
			
 
				+   * have the value -1. The values 4 and 5 are presented in
			
 
				+   * this table, even though valid UTF-8 cannot include the
			
 
				+   * five and six byte sequences.
			
 
				+   */
			
 
				+  static final int[] bytesFromUTF8 =
			
 
				+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			
 
				+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			
 
				+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			
 
				+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			
 
				+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			
 
				+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			
 
				+    0, 0, 0, 0, 0, 0, 0,
			
 
				+    // trail bytes
			
 
				+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
			
 
				+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
			
 
				+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
			
 
				+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
			
 
				+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
			
 
				+    1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
			
 
				+    3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
			
 
				+
			
 
				+  /**
			
 
				+   * Returns the next code point at the current position in
			
 
				+   * the buffer. The buffer's position will be incremented.
			
 
				+   * Any mark set on this buffer will be changed by this method!
			
 
				+   */
			
 
				+  public static int bytesToCodePoint(ByteBuffer bytes) {
			
 
				+    bytes.mark();
			
 
				+    byte b = bytes.get();
			
 
				+    bytes.reset();
			
 
				+    int extraBytesToRead = bytesFromUTF8[(int)(b & 0xFF)];
			
 
				+    if (extraBytesToRead < 0) return -1; // trailing byte!
			
 
				+    int ch = 0;
			
 
				+
			
 
				+    switch (extraBytesToRead) {
			
 
				+    case 5: ch += (int)(bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
			
 
				+    case 4: ch += (int)(bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
			
 
				+    case 3: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
			
 
				+    case 2: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
			
 
				+    case 1: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
			
 
				+    case 0: ch += (int)(bytes.get() & 0xFF);
			
 
				+    }
			
 
				+    ch -= offsetsFromUTF8[extraBytesToRead];
			
 
				+
			
 
				+    return ch;
			
 
				+  }
			
 
				+
			
 
				+  
			
 
				+  static final int offsetsFromUTF8[] =
			
 
				+  { 0x00000000, 0x00003080,
			
 
				+    0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
			
 
				+
			
 
				+  /**
			
 
				+   * For the given string, returns the number of UTF-8 bytes
			
 
				+   * required to encode the string.
			
 
				+   * @param string text to encode
			
 
				+   * @return number of UTF-8 bytes required to encode
			
 
				+   */
			
 
				+  public static int utf8Length(String string) {
			
 
				+    CharacterIterator iter = new StringCharacterIterator(string);
			
 
				+    char ch = iter.first();
			
 
				+    int size = 0;
			
 
				+    while (ch != CharacterIterator.DONE) {
			
 
				+      if ((ch >= 0xD800) && (ch < 0xDC00)) {
			
 
				+        // surrogate pair?
			
 
				+        char trail = iter.next();
			
 
				+        if ((trail > 0xDBFF) && (trail < 0xE000)) {
			
 
				+          // valid pair
			
 
				+          size += 4;
			
 
				+        } else {
			
 
				+          // invalid pair
			
 
				+          size += 3;
			
 
				+          iter.previous(); // rewind one
			
 
				+        }
			
 
				+      } else if (ch < 0x80) {
			
 
				+        size++;
			
 
				+      } else if (ch < 0x800) {
			
 
				+        size += 2;
			
 
				+      } else {
			
 
				+        // ch < 0x10000, that is, the largest char value
			
 
				+        size += 3;
			
 
				+      }
			
 
				+      ch = iter.next();
			
 
				+    }
			
 
				+    return size;
			
 
				+  }
			
 
				+}
			
--- a/src/java/org/apache/hadoop/io/UTF8.java
+++ b/src/java/org/apache/hadoop/io/UTF8.java
@@ -28,6 +28,7 @@ import org.apache.commons.logging.*;
 
				  * <p>Also includes utilities for efficiently reading and writing UTF-8.
			
 
				  *
			
 
				  * @author Doug Cutting
			
 
				+ * @deprecated replaced by Text
			
 
				  */
			
 
				 public class UTF8 implements WritableComparable {
			
 
				   private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.UTF8");
			
--- a/src/java/org/apache/hadoop/io/WritableComparator.java
+++ b/src/java/org/apache/hadoop/io/WritableComparator.java
@@ -158,4 +158,38 @@ public class WritableComparator implements Comparator {
 
				       (readInt(bytes, start+4) & 0xFFFFFFFFL);
			
 
				   }
			
 
				 
			
 
				+  /**
			
 
				+   * Reads a zero-compressed encoded long from a byte array and returns it.
			
 
				+   * @param bytes: byte array with decode long
			
 
				+   * @param start: starting index
			
 
				+   * @throws java.io.IOException 
			
 
				+   * @return deserialized long
			
 
				+   */
			
 
				+  static long readVLong(byte[] bytes, int start) throws IOException {
			
 
				+      int len = bytes[start];
			
 
				+      if (len >= -112) {
			
 
				+          return len;
			
 
				+      }
			
 
				+      len = (len < -120) ? -(len + 120) : -(len + 112);
			
 
				+      if (start+1+len>bytes.length)
			
 
				+          throw new IOException(
			
 
				+                  "Not enough number of bytes for a zero-compressed integer");
			
 
				+      long i = 0;
			
 
				+      for (int idx = 0; idx < len; idx++) {
			
 
				+          i = i << 8;
			
 
				+          i = i | (bytes[start+1+idx] & 0xFF);
			
 
				+      }
			
 
				+      return i;
			
 
				+  }
			
 
				+  
			
 
				+  /**
			
 
				+   * Reads a zero-compressed encoded integer from a byte array and returns it.
			
 
				+   * @param bytes: byte array with the encoded integer
			
 
				+   * @param start: start index
			
 
				+   * @throws java.io.IOException 
			
 
				+   * @return deserialized integer
			
 
				+   */
			
 
				+  static int readVInt(byte[] bytes, int start) throws IOException {
			
 
				+      return (int) readVLong(bytes, start);
			
 
				+  }
			
 
				 }
			
--- a/src/java/org/apache/hadoop/io/WritableUtils.java
+++ b/src/java/org/apache/hadoop/io/WritableUtils.java
@@ -17,6 +17,7 @@
 
				 package org.apache.hadoop.io;
			
 
				 
			
 
				 import java.io.*;
			
 
				+
			
 
				 import org.apache.hadoop.mapred.JobConf;
			
 
				 import java.util.zip.GZIPInputStream;
			
 
				 import java.util.zip.GZIPOutputStream;
			
@@ -230,5 +231,130 @@ public final class WritableUtils  {
 
				       throw new RuntimeException("Error writing/reading clone buffer", e);
			
 
				     }
			
 
				   }
			
 
				+ 
			
 
				+  /**
			
 
				+   * Serializes an integer to a binary stream with zero-compressed encoding.
			
 
				+   * For -120 <= i <= 127, only one byte is used with the actual value.
			
 
				+   * For other values of i, the first byte value indicates whether the
			
 
				+   * integer is positive or negative, and the number of bytes that follow.
			
 
				+   * If the first byte value v is between -121 and -124, the following integer
			
 
				+   * is positive, with number of bytes that follow are -(v+120).
			
 
				+   * If the first byte value v is between -125 and -128, the following integer
			
 
				+   * is negative, with number of bytes that follow are -(v+124). Bytes are
			
 
				+   * stored in the high-non-zero-byte-first order.
			
 
				+   *
			
 
				+   * @param stream Binary output stream
			
 
				+   * @param i Integer to be serialized
			
 
				+   * @throws java.io.IOException 
			
 
				+   */
			
 
				+  public static void writeVInt(DataOutput stream, int i) throws IOException {
			
 
				+      writeVLong(stream, i);
			
 
				+  }
			
 
				+  
			
 
				+  /**
			
 
				+   * Serializes a long to a binary stream with zero-compressed encoding.
			
 
				+   * For -112 <= i <= 127, only one byte is used with the actual value.
			
 
				+   * For other values of i, the first byte value indicates whether the
			
 
				+   * long is positive or negative, and the number of bytes that follow.
			
 
				+   * If the first byte value v is between -113 and -120, the following long
			
 
				+   * is positive, with number of bytes that follow are -(v+112).
			
 
				+   * If the first byte value v is between -121 and -128, the following long
			
 
				+   * is negative, with number of bytes that follow are -(v+120). Bytes are
			
 
				+   * stored in the high-non-zero-byte-first order.
			
 
				+   * 
			
 
				+   * @param stream Binary output stream
			
 
				+   * @param i Long to be serialized
			
 
				+   * @throws java.io.IOException 
			
 
				+   */
			
 
				+  public static void writeVLong(DataOutput stream, long i) throws IOException {
			
 
				+      if (i >= -112 && i <= 127) {
			
 
				+          stream.writeByte((byte)i);
			
 
				+          return;
			
 
				+      }
			
 
				+      
			
 
				+      int len = -112;
			
 
				+      if (i < 0) {
			
 
				+          i &= 0x7FFFFFFFFFFFFFFFL; // reset the sign bit
			
 
				+          len = -120;
			
 
				+      }
			
 
				+      
			
 
				+      long tmp = i;
			
 
				+      while (tmp != 0) {
			
 
				+          tmp = tmp >> 8;
			
 
				+          len--;
			
 
				+      }
			
 
				+      
			
 
				+      stream.writeByte((byte)len);
			
 
				+      
			
 
				+      len = (len < -120) ? -(len + 120) : -(len + 112);
			
 
				+      
			
 
				+      for (int idx = len; idx != 0; idx--) {
			
 
				+          int shiftbits = (idx - 1) * 8;
			
 
				+          long mask = 0xFFL << shiftbits;
			
 
				+          stream.writeByte((byte)((i & mask) >> shiftbits));
			
 
				+      }
			
 
				+  }
			
 
				+  
			
 
				+
			
 
				+  /**
			
 
				+   * Reads a zero-compressed encoded long from input stream and returns it.
			
 
				+   * @param stream Binary input stream
			
 
				+   * @throws java.io.IOException 
			
 
				+   * @return deserialized long from stream.
			
 
				+   */
			
 
				+  public static long readVLong(DataInput stream) throws IOException {
			
 
				+      int len = stream.readByte();
			
 
				+      if (len >= -112) {
			
 
				+          return len;
			
 
				+      }
			
 
				+      len = (len < -120) ? -(len + 120) : -(len + 112);
			
 
				+      byte[] barr = new byte[len];
			
 
				+      stream.readFully(barr);
			
 
				+      long i = 0;
			
 
				+      for (int idx = 0; idx < len; idx++) {
			
 
				+          i = i << 8;
			
 
				+          i = i | (barr[idx] & 0xFF);
			
 
				+      }
			
 
				+      return i;
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Reads a zero-compressed encoded integer from input stream and returns it.
			
 
				+   * @param stream Binary input stream
			
 
				+   * @throws java.io.IOException 
			
 
				+   * @return deserialized integer from stream.
			
 
				+   */
			
 
				+  public static int readVInt(DataInput stream) throws IOException {
			
 
				+      return (int) readVLong(stream);
			
 
				+  }
			
 
				+  
			
 
				+
			
 
				+  /**
			
 
				+   * Get the encoded length if an integer is stored in a variable-length format
			
 
				+   * @param i: a long
			
 
				+   * @return the encoded length 
			
 
				+   */
			
 
				+  
			
 
				+  public static int getVIntSize(long i) {
			
 
				+      if (i >= -112 && i <= 127) {
			
 
				+          return 1;
			
 
				+      }
			
 
				+      
			
 
				+      int len = -112;
			
 
				+      if (i < 0) {
			
 
				+          i &= 0x7FFFFFFFFFFFFFFFL; // reset the sign bit
			
 
				+          len = -120;
			
 
				+      }
			
 
				+      
			
 
				+      long tmp = i;
			
 
				+      while (tmp != 0) {
			
 
				+          tmp = tmp >> 8;
			
 
				+          len--;
			
 
				+      }
			
 
				+      
			
 
				+      len = (len < -120) ? -(len + 120) : -(len + 112);
			
 
				+      
			
 
				+      return len+1;
			
 
				+  }
			
 
				   
			
 
				 }
			
--- a/src/java/org/apache/hadoop/record/BinaryInputArchive.java
+++ b/src/java/org/apache/hadoop/record/BinaryInputArchive.java
@@ -22,6 +22,8 @@ import java.io.ByteArrayOutputStream;
 
				 import java.io.DataInputStream;
			
 
				 import java.io.InputStream;
			
 
				 
			
 
				+import org.apache.hadoop.io.WritableUtils;
			
 
				+
			
 
				 
			
 
				 /**
			
 
				  *
			
@@ -61,11 +63,11 @@ public class BinaryInputArchive implements InputArchive {
 
				     }
			
 
				     
			
 
				     public int readInt(String tag) throws IOException {
			
 
				-        return Utils.readInt(in);
			
 
				+        return WritableUtils.readVInt(in);
			
 
				     }
			
 
				     
			
 
				     public long readLong(String tag) throws IOException {
			
 
				-        return Utils.readLong(in);
			
 
				+        return WritableUtils.readVLong(in);
			
 
				     }
			
 
				     
			
 
				     public float readFloat(String tag) throws IOException {
			
@@ -77,14 +79,14 @@ public class BinaryInputArchive implements InputArchive {
 
				     }
			
 
				     
			
 
				     public String readString(String tag) throws IOException {
			
 
				-        int len = Utils.readInt(in);
			
 
				+        int len = readInt(tag);
			
 
				         byte[] chars = new byte[len];
			
 
				         in.readFully(chars);
			
 
				         return new String(chars, "UTF-8");
			
 
				     }
			
 
				     
			
 
				     public ByteArrayOutputStream readBuffer(String tag) throws IOException {
			
 
				-        int len = Utils.readInt(in);
			
 
				+        int len = readInt(tag);
			
 
				         ByteArrayOutputStream buf = new ByteArrayOutputStream(len);
			
 
				         byte[] arr = new byte[len];
			
 
				         in.readFully(arr);
			
@@ -101,13 +103,13 @@ public class BinaryInputArchive implements InputArchive {
 
				     public void endRecord(String tag) throws IOException {}
			
 
				     
			
 
				     public Index startVector(String tag) throws IOException {
			
 
				-        return new BinaryIndex(Utils.readInt(in));
			
 
				+        return new BinaryIndex(readInt(tag));
			
 
				     }
			
 
				     
			
 
				     public void endVector(String tag) throws IOException {}
			
 
				     
			
 
				     public Index startMap(String tag) throws IOException {
			
 
				-        return new BinaryIndex(Utils.readInt(in));
			
 
				+        return new BinaryIndex(readInt(tag));
			
 
				     }
			
 
				     
			
 
				     public void endMap(String tag) throws IOException {}
			
--- a/src/java/org/apache/hadoop/record/BinaryOutputArchive.java
+++ b/src/java/org/apache/hadoop/record/BinaryOutputArchive.java
@@ -24,6 +24,8 @@ import java.io.DataOutput;
 
				 import java.io.DataOutputStream;
			
 
				 import java.io.OutputStream;
			
 
				 
			
 
				+import org.apache.hadoop.io.WritableUtils;
			
 
				+
			
 
				 /**
			
 
				  *
			
 
				  * @author Milind Bhandarkar
			
@@ -50,11 +52,11 @@ public class BinaryOutputArchive implements OutputArchive {
 
				     }
			
 
				     
			
 
				     public void writeInt(int i, String tag) throws IOException {
			
 
				-        Utils.writeInt(out, i);
			
 
				+        WritableUtils.writeVInt(out, i);
			
 
				     }
			
 
				     
			
 
				     public void writeLong(long l, String tag) throws IOException {
			
 
				-        Utils.writeLong(out, l);
			
 
				+        WritableUtils.writeVLong(out, l);
			
 
				     }
			
 
				     
			
 
				     public void writeFloat(float f, String tag) throws IOException {
			
@@ -67,14 +69,14 @@ public class BinaryOutputArchive implements OutputArchive {
 
				     
			
 
				     public void writeString(String s, String tag) throws IOException {
			
 
				         byte[] chars = s.getBytes("UTF-8");
			
 
				-        Utils.writeInt(out, chars.length);
			
 
				+        writeInt(chars.length, tag);
			
 
				         out.write(chars);
			
 
				     }
			
 
				     
			
 
				     public void writeBuffer(ByteArrayOutputStream buf, String tag)
			
 
				     throws IOException {
			
 
				         byte[] barr = buf.toByteArray();
			
 
				-        Utils.writeInt(out, barr.length);
			
 
				+        writeInt(barr.length, tag);
			
 
				         out.write(barr);
			
 
				     }
			
 
				     
			
@@ -87,13 +89,13 @@ public class BinaryOutputArchive implements OutputArchive {
 
				     public void endRecord(Record r, String tag) throws IOException {}
			
 
				     
			
 
				     public void startVector(ArrayList v, String tag) throws IOException {
			
 
				-        Utils.writeInt(out, v.size());
			
 
				+        writeInt(v.size(), tag);
			
 
				     }
			
 
				     
			
 
				     public void endVector(ArrayList v, String tag) throws IOException {}
			
 
				     
			
 
				     public void startMap(TreeMap v, String tag) throws IOException {
			
 
				-        Utils.writeInt(out, v.size());
			
 
				+        writeInt(v.size(), tag);
			
 
				     }
			
 
				     
			
 
				     public void endMap(TreeMap v, String tag) throws IOException {}
			
--- a/src/java/org/apache/hadoop/record/Utils.java
+++ b/src/java/org/apache/hadoop/record/Utils.java
@@ -46,7 +46,8 @@ public class Utils {
 
				      * @param stream Binary output stream
			
 
				      * @param i Integer to be serialized
			
 
				      * @throws java.io.IOException 
			
 
				-     */
			
 
				+     * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#writeVInt}
			
 
				+      */
			
 
				     static void writeInt(DataOutput stream, int i) throws IOException {
			
 
				         if (i >= -120 && i <= 127) {
			
 
				             stream.writeByte((byte)i);
			
@@ -89,7 +90,8 @@ public class Utils {
 
				      * 
			
 
				      * @param stream Binary output stream
			
 
				      * @param i Long to be serialized
			
 
				-     * @throws java.io.IOException 
			
 
				+     * @throws java.io.IOException
			
 
				+     * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#writeVLong}
			
 
				      */
			
 
				     static void writeLong(DataOutput stream, long i) throws IOException {
			
 
				         if (i >= -112 && i <= 127) {
			
@@ -125,6 +127,7 @@ public class Utils {
 
				      * @param stream Binary input stream
			
 
				      * @throws java.io.IOException 
			
 
				      * @return deserialized integer from stream.
			
 
				+     * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#readVInt}
			
 
				      */
			
 
				     static int readInt(DataInput stream) throws IOException {
			
 
				         int len = stream.readByte();
			
@@ -147,7 +150,8 @@ public class Utils {
 
				      * @param stream Binary input stream
			
 
				      * @throws java.io.IOException 
			
 
				      * @return deserialized long from stream.
			
 
				-     */
			
 
				+     * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#readVLong}
			
 
				+      */
			
 
				     static long readLong(DataInput stream) throws IOException {
			
 
				         int len = stream.readByte();
			
 
				         if (len >= -112) {
			
--- a/src/test/org/apache/hadoop/io/TestText.java
+++ b/src/test/org/apache/hadoop/io/TestText.java
@@ -0,0 +1,223 @@
 
				+/**
			
 
				+ * Copyright 2005 The Apache Software Foundation
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+package org.apache.hadoop.io;
			
 
				+
			
 
				+import junit.framework.TestCase;
			
 
				+
			
 
				+import java.io.IOException;
			
 
				+import java.nio.ByteBuffer;
			
 
				+import java.nio.charset.CharacterCodingException;
			
 
				+import java.util.Random;
			
 
				+
			
 
				+import org.apache.commons.logging.Log;
			
 
				+import org.apache.commons.logging.LogFactory;
			
 
				+
			
 
				+/**
				+ * Unit tests for {@link Text}.
				+ *
				+ * Failures inside a test are not caught and logged: every test method
				+ * declares <code>throws Exception</code>, so an unexpected IOException or
				+ * CharacterCodingException fails the test instead of being hidden.
				+ */
				+public class TestText extends TestCase {
				+  private static final int NUM_ITERATIONS = 100;
				+  public TestText(String name) { super(name); }
				+
				+  // fixed seed, so the generated strings are the same on every run
				+  private static final Random RANDOM = new Random(1);
				+
				+  // sentinel asking getTestString(int) to pick a random length
				+  private static final int RAND_LEN = -1;
				+  
				+  /**
				+   * Generates a valid java String out of random defined code points.
				+   * @param len minimum number of chars wanted, or RAND_LEN to draw a
				+   *            length below 1000 from RANDOM
				+   * @return the generated string; it may be one char longer than asked
				+   *         because chars are appended two at a time
				+   */
				+  private static String getTestString(int len) throws Exception {
				+    StringBuilder buffer = new StringBuilder();    
				+    int length = (len==RAND_LEN) ? RANDOM.nextInt(1000) : len;
				+    while (buffer.length()<length) {
				+        int codePoint = RANDOM.nextInt(Character.MAX_CODE_POINT);
				+        char tmpStr[] = new char[2];
				+        if(Character.isDefined(codePoint)) {
				+            // keep only BMP code points that are not surrogates, so that
				+            // no unpaired surrogate can end up in the string
				+            if(codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT &&
				+                    !Character.isHighSurrogate((char)codePoint) &&
				+                    !Character.isLowSurrogate((char)codePoint) ) {
				+               Character.toChars(codePoint, tmpStr, 0);
				+               // NOTE(review): both slots of tmpStr are appended, so each
				+               // accepted code point is followed by the unused U+0000 slot
				+               buffer.append(tmpStr);
				+            }
				+        }
				+    }
				+    return buffer.toString();
				+  }
				+  
				+  /** Generates a valid java String of random length (below about 1000 chars). */
				+  public static String getTestString() throws Exception {
				+    return getTestString(RAND_LEN);
				+  }
				+  
				+  /**
				+   * Builds a string longer than Short.MAX_VALUE chars by repeating a
				+   * random, non-empty seed string.
				+   */
				+  public static String getLongString() throws Exception {
				+      String str = getTestString();
				+      // an empty seed would never grow the buffer below, so draw again
				+      while (str.length() == 0) {
				+          str = getTestString();
				+      }
				+      int length = Short.MAX_VALUE+str.length();
				+      StringBuilder buffer = new StringBuilder();
				+      while(buffer.length()<length)
				+          buffer.append(str);
				+      
				+      return buffer.toString();
				+  }
				+
				+  /** Runs Text instances through TestWritable.testWritable. */
				+  public void testWritable() throws Exception {
				+    for (int i = 0; i < NUM_ITERATIONS; i++) {
				+      // the first iteration covers a long string, the others random ones
				+      String str = (i == 0) ? getLongString() : getTestString();
				+      TestWritable.testWritable(new Text(str));
				+    }
				+  }
				+
				+
				+  /** Checks Text.encode and Text.decode against String's own UTF-8 coding. */
				+  public void testCoding() throws Exception {
				+    for (int i = 0; i < NUM_ITERATIONS; i++) {
				+      // generate a random string
				+      String before = (i == 0) ? getLongString() : getTestString();
				+
				+      // test string to utf8
				+      ByteBuffer bb = Text.encode(before);
				+
				+      byte[] utf8Text = bb.array();
				+      byte[] utf8Java = before.getBytes("UTF-8");
				+      assertEquals(0, WritableComparator.compareBytes(
				+                  utf8Text, 0, bb.limit(),
				+                  utf8Java, 0, utf8Java.length));
				+
				+      // test utf8 to string
				+      String after = Text.decode(utf8Java);
				+      assertEquals(before, after);
				+    }
				+  }
				+  
				+  
				+  /** Checks that Text.writeString and Text.readString round-trip a string. */
				+  public void testIO() throws Exception {
				+    DataOutputBuffer out = new DataOutputBuffer();
				+    DataInputBuffer in = new DataInputBuffer();
				+
				+    for (int i = 0; i < NUM_ITERATIONS; i++) {
				+      // generate a random string
				+      String before = (i == 0) ? getLongString() : getTestString();
				+
				+      // write it
				+      out.reset();
				+      Text.writeString(out, before);
				+
				+      // test that it reads correctly
				+      in.reset(out.getData(), out.getLength());
				+      String after = Text.readString(in);
				+      assertEquals(before, after);
				+
				+      // Test compatibility with Java's other decoder:
				+      // skip the length prefix and decode the rest as plain UTF-8
				+      int strLenSize = WritableUtils.getVIntSize(Text.utf8Length(before));
				+      String after2 = new String(out.getData(), strLenSize, 
				+          out.getLength()-strLenSize, "UTF-8");
				+      assertEquals(before, after2);
				+    }
				+  }
				+
				+  /** Checks that Text.Comparator on serialized bytes agrees with Text.compareTo. */
				+  public void testCompare() throws Exception {
				+      DataOutputBuffer out1 = new DataOutputBuffer();
				+      DataOutputBuffer out2 = new DataOutputBuffer();
				+      DataOutputBuffer out3 = new DataOutputBuffer();
				+      Text.Comparator comparator = new Text.Comparator();
				+      for (int i=0; i<NUM_ITERATIONS; i++ ) {
				+          // reset output buffer
				+          out1.reset();
				+          out2.reset();
				+          out3.reset();
				+
				+          // generate two random strings
				+          String str1;
				+          String str2;
				+          if(i == 0 ) {
				+              str1 = getLongString();
				+              str2 = getLongString();
				+          } else {
				+              str1 = getTestString();
				+              str2 = getTestString();
				+          }
				+          
				+          // convert to texts
				+          Text txt1 = new Text(str1);
				+          Text txt2 = new Text(str2);
				+          Text txt3 = new Text(str1);
				+          
				+          // serialize them
				+          txt1.write(out1);
				+          txt2.write(out2);
				+          txt3.write(out3);
				+          
				+          // compare two strings by looking at their binary formats
				+          int ret1 = comparator.compare(out1.getData(), 0, out1.getLength(),
				+                  out2.getData(), 0, out2.getLength());
				+          // compare two strings
				+          int ret2 = txt1.compareTo(txt2);
				+          
				+          assertEquals(ret1, ret2);
				+          
				+          // test equal; each buffer is paired with its own length
				+          assertEquals(0, txt1.compareTo(txt3));
				+          assertEquals(0, comparator.compare(out1.getData(), 0, out1.getLength(),
				+                  out3.getData(), 0, out3.getLength()));
				+      }
				+  }
				+      
				+  /** Checks the positions reported by Text.find on text holding a multi-byte character. */
				+  public void testFind() throws Exception {
				+      Text text = new Text("abcd\u20acbdcd\u20ac");
				+      assertEquals(-1, text.find("abd"));
				+      assertEquals(-1, text.find("ac"));
				+      assertEquals(4, text.find("\u20ac"));
				+      assertEquals(11, text.find("\u20ac", 5));
				+  }
				+  
				+  /** Runs every test in turn without a JUnit runner. */
				+  public static void main(String[] args)  throws Exception
				+  {
				+    TestText test = new TestText("main");
				+    test.testIO();
				+    test.testCompare();
				+    test.testCoding();
				+    test.testWritable();
				+    test.testFind();
				+  }
				+}