|
@@ -24,11 +24,11 @@ import java.io.DataOutput;
|
|
|
import java.nio.ByteBuffer;
|
|
|
import java.nio.CharBuffer;
|
|
|
import java.nio.charset.CharacterCodingException;
|
|
|
-import java.nio.charset.Charset;
|
|
|
import java.nio.charset.CharsetDecoder;
|
|
|
import java.nio.charset.CharsetEncoder;
|
|
|
import java.nio.charset.CodingErrorAction;
|
|
|
import java.nio.charset.MalformedInputException;
|
|
|
+import java.nio.charset.StandardCharsets;
|
|
|
import java.text.CharacterIterator;
|
|
|
import java.text.StringCharacterIterator;
|
|
|
import java.util.Arrays;
|
|
@@ -52,63 +52,67 @@ import org.apache.hadoop.classification.InterfaceStability;
|
|
|
@InterfaceStability.Stable
|
|
|
public class Text extends BinaryComparable
|
|
|
implements WritableComparable<BinaryComparable> {
|
|
|
-
|
|
|
+
|
|
|
private static final ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
|
|
|
new ThreadLocal<CharsetEncoder>() {
|
|
|
@Override
|
|
|
protected CharsetEncoder initialValue() {
|
|
|
- return Charset.forName("UTF-8").newEncoder().
|
|
|
+ return StandardCharsets.UTF_8.newEncoder().
|
|
|
onMalformedInput(CodingErrorAction.REPORT).
|
|
|
onUnmappableCharacter(CodingErrorAction.REPORT);
|
|
|
}
|
|
|
};
|
|
|
-
|
|
|
+
|
|
|
private static final ThreadLocal<CharsetDecoder> DECODER_FACTORY =
|
|
|
new ThreadLocal<CharsetDecoder>() {
|
|
|
@Override
|
|
|
protected CharsetDecoder initialValue() {
|
|
|
- return Charset.forName("UTF-8").newDecoder().
|
|
|
+ return StandardCharsets.UTF_8.newDecoder().
|
|
|
onMalformedInput(CodingErrorAction.REPORT).
|
|
|
onUnmappableCharacter(CodingErrorAction.REPORT);
|
|
|
}
|
|
|
};
|
|
|
-
|
|
|
- private static final byte [] EMPTY_BYTES = new byte[0];
|
|
|
-
|
|
|
- private byte[] bytes;
|
|
|
- private int length;
|
|
|
|
|
|
+ private static final byte[] EMPTY_BYTES = new byte[0];
|
|
|
+
|
|
|
+ private byte[] bytes = EMPTY_BYTES;
|
|
|
+ private int length = 0;
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Construct an empty text string.
|
|
|
+ */
|
|
|
public Text() {
|
|
|
- bytes = EMPTY_BYTES;
|
|
|
}
|
|
|
|
|
|
- /** Construct from a string.
|
|
|
+ /**
|
|
|
+ * Construct from a string.
|
|
|
*/
|
|
|
public Text(String string) {
|
|
|
set(string);
|
|
|
}
|
|
|
|
|
|
- /** Construct from another text. */
|
|
|
+ /**
|
|
|
+ * Construct from another text.
|
|
|
+ */
|
|
|
public Text(Text utf8) {
|
|
|
set(utf8);
|
|
|
}
|
|
|
|
|
|
- /** Construct from a byte array.
|
|
|
+ /**
|
|
|
+ * Construct from a byte array.
|
|
|
*/
|
|
|
public Text(byte[] utf8) {
|
|
|
set(utf8);
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/**
|
|
|
* Get a copy of the bytes that is exactly the length of the data.
|
|
|
* See {@link #getBytes()} for faster access to the underlying array.
|
|
|
*/
|
|
|
public byte[] copyBytes() {
|
|
|
- byte[] result = new byte[length];
|
|
|
- System.arraycopy(bytes, 0, result, 0, length);
|
|
|
- return result;
|
|
|
+ return Arrays.copyOf(bytes, length);
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/**
|
|
|
* Returns the raw bytes; however, only data up to {@link #getLength()} is
|
|
|
* valid. Please use {@link #copyBytes()} if you
|
|
@@ -119,12 +123,14 @@ public class Text extends BinaryComparable
|
|
|
return bytes;
|
|
|
}
|
|
|
|
|
|
- /** Returns the number of bytes in the byte array */
|
|
|
+ /**
|
|
|
+ * Returns the number of bytes in the byte array.
|
|
|
+ */
|
|
|
@Override
|
|
|
public int getLength() {
|
|
|
return length;
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/**
|
|
|
* Returns the Unicode Scalar Value (32-bit integer value)
|
|
|
* for the character at <code>position</code>. Note that this
|
|
@@ -136,15 +142,15 @@ public class Text extends BinaryComparable
|
|
|
public int charAt(int position) {
|
|
|
if (position > this.length) return -1; // too long
|
|
|
if (position < 0) return -1; // duh.
|
|
|
-
|
|
|
+
|
|
|
ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
|
|
|
return bytesToCodePoint(bb.slice());
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
public int find(String what) {
|
|
|
return find(what, 0);
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/**
|
|
|
* Finds any occurrence of <code>what</code> in the backing
|
|
|
* buffer, starting as position <code>start</code>. The starting
|
|
@@ -156,11 +162,11 @@ public class Text extends BinaryComparable
|
|
|
*/
|
|
|
public int find(String what, int start) {
|
|
|
try {
|
|
|
- ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
|
|
|
+ ByteBuffer src = ByteBuffer.wrap(this.bytes, 0, this.length);
|
|
|
ByteBuffer tgt = encode(what);
|
|
|
byte b = tgt.get();
|
|
|
src.position(start);
|
|
|
-
|
|
|
+
|
|
|
while (src.hasRemaining()) {
|
|
|
if (b == src.get()) { // matching first byte
|
|
|
src.mark(); // save position in loop
|
|
@@ -186,54 +192,63 @@ public class Text extends BinaryComparable
|
|
|
}
|
|
|
return -1; // not found
|
|
|
} catch (CharacterCodingException e) {
|
|
|
- // can't get here
|
|
|
- e.printStackTrace();
|
|
|
- return -1;
|
|
|
+ throw new RuntimeException("Should not have happened", e);
|
|
|
}
|
|
|
- }
|
|
|
- /** Set to contain the contents of a string.
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Set to contain the contents of a string.
|
|
|
*/
|
|
|
public void set(String string) {
|
|
|
try {
|
|
|
ByteBuffer bb = encode(string, true);
|
|
|
bytes = bb.array();
|
|
|
length = bb.limit();
|
|
|
- }catch(CharacterCodingException e) {
|
|
|
- throw new RuntimeException("Should not have happened ", e);
|
|
|
+ } catch (CharacterCodingException e) {
|
|
|
+ throw new RuntimeException("Should not have happened", e);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- /** Set to a utf8 byte array
|
|
|
+ /**
|
|
|
+ * Set to a utf8 byte array.
|
|
|
*/
|
|
|
public void set(byte[] utf8) {
|
|
|
set(utf8, 0, utf8.length);
|
|
|
}
|
|
|
-
|
|
|
- /** copy a text. */
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Copy a text.
|
|
|
+ */
|
|
|
public void set(Text other) {
|
|
|
set(other.getBytes(), 0, other.getLength());
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * Set the Text to range of bytes
|
|
|
+ * Set the Text to range of bytes.
|
|
|
+ *
|
|
|
* @param utf8 the data to copy from
|
|
|
* @param start the first position of the new string
|
|
|
* @param len the number of bytes of the new string
|
|
|
*/
|
|
|
public void set(byte[] utf8, int start, int len) {
|
|
|
- setCapacity(len, false);
|
|
|
+ ensureCapacity(len);
|
|
|
System.arraycopy(utf8, start, bytes, 0, len);
|
|
|
this.length = len;
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * Append a range of bytes to the end of the given text
|
|
|
+ * Append a range of bytes to the end of the given text.
|
|
|
+ *
|
|
|
* @param utf8 the data to copy from
|
|
|
* @param start the first position to append from utf8
|
|
|
* @param len the number of bytes to append
|
|
|
*/
|
|
|
public void append(byte[] utf8, int start, int len) {
|
|
|
- setCapacity(length + len, true);
|
|
|
+ byte[] original = bytes;
|
|
|
+ int capacity = Math.max(length + len, length + (length >> 1));
|
|
|
+ if (ensureCapacity(capacity)) {
|
|
|
+ System.arraycopy(original, 0, bytes, 0, length);
|
|
|
+ }
|
|
|
System.arraycopy(utf8, start, bytes, length, len);
|
|
|
length += len;
|
|
|
}
|
|
@@ -250,47 +265,39 @@ public class Text extends BinaryComparable
|
|
|
length = 0;
|
|
|
}
|
|
|
|
|
|
- /*
|
|
|
+ /**
|
|
|
* Sets the capacity of this Text object to <em>at least</em>
|
|
|
- * <code>len</code> bytes. If the current buffer is longer,
|
|
|
- * then the capacity and existing content of the buffer are
|
|
|
- * unchanged. If <code>len</code> is larger
|
|
|
- * than the current capacity, the Text object's capacity is
|
|
|
- * increased to match.
|
|
|
- * @param len the number of bytes we need
|
|
|
- * @param keepData should the old data be kept
|
|
|
- */
|
|
|
- private void setCapacity(int len, boolean keepData) {
|
|
|
- if (bytes == null || bytes.length < len) {
|
|
|
- if (bytes != null && keepData) {
|
|
|
- bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
|
|
|
- } else {
|
|
|
- bytes = new byte[len];
|
|
|
- }
|
|
|
+ * <code>capacity</code> bytes. If the current buffer is longer, then the
|
|
|
+ * capacity and existing content of the buffer are unchanged. If
|
|
|
+ * <code>capacity</code> is larger than the current capacity, the Text
|
|
|
+ * object's capacity is increased to match and any existing data is lost.
|
|
|
+ *
|
|
|
+ * @param capacity the number of bytes we need
|
|
|
+ * @return true if the internal array was resized or false otherwise
|
|
|
+ */
|
|
|
+ private boolean ensureCapacity(final int capacity) {
|
|
|
+ if (bytes.length < capacity) {
|
|
|
+ bytes = new byte[capacity];
|
|
|
+ return true;
|
|
|
}
|
|
|
+ return false;
|
|
|
}
|
|
|
-
|
|
|
- /**
|
|
|
- * Convert text back to string
|
|
|
- * @see java.lang.Object#toString()
|
|
|
- */
|
|
|
+
|
|
|
@Override
|
|
|
public String toString() {
|
|
|
try {
|
|
|
return decode(bytes, 0, length);
|
|
|
} catch (CharacterCodingException e) {
|
|
|
- throw new RuntimeException("Should not have happened " , e);
|
|
|
+ throw new RuntimeException("Should not have happened", e);
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
- /** deserialize
|
|
|
- */
|
|
|
+
|
|
|
@Override
|
|
|
public void readFields(DataInput in) throws IOException {
|
|
|
int newLength = WritableUtils.readVInt(in);
|
|
|
readWithKnownLength(in, newLength);
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
public void readFields(DataInput in, int maxLength) throws IOException {
|
|
|
int newLength = WritableUtils.readVInt(in);
|
|
|
if (newLength < 0) {
|
|
@@ -303,7 +310,9 @@ public class Text extends BinaryComparable
|
|
|
readWithKnownLength(in, newLength);
|
|
|
}
|
|
|
|
|
|
- /** Skips over one Text in the input. */
|
|
|
+ /**
|
|
|
+ * Skips over one Text in the input.
|
|
|
+ */
|
|
|
public static void skip(DataInput in) throws IOException {
|
|
|
int length = WritableUtils.readVInt(in);
|
|
|
WritableUtils.skipFully(in, length);
|
|
@@ -315,14 +324,14 @@ public class Text extends BinaryComparable
|
|
|
* format.
|
|
|
*/
|
|
|
public void readWithKnownLength(DataInput in, int len) throws IOException {
|
|
|
- setCapacity(len, false);
|
|
|
+ ensureCapacity(len);
|
|
|
in.readFully(bytes, 0, len);
|
|
|
length = len;
|
|
|
}
|
|
|
|
|
|
- /** serialize
|
|
|
- * write this object to out
|
|
|
- * length uses zero-compressed encoding
|
|
|
+ /**
|
|
|
+ * Serialize. Write this object to out length uses zero-compressed encoding.
|
|
|
+ *
|
|
|
* @see Writable#write(DataOutput)
|
|
|
*/
|
|
|
@Override
|
|
@@ -341,7 +350,10 @@ public class Text extends BinaryComparable
|
|
|
out.write(bytes, 0, length);
|
|
|
}
|
|
|
|
|
|
- /** Returns true iff <code>o</code> is a Text with the same contents. */
|
|
|
+ /**
|
|
|
+ * Returns true iff <code>o</code> is a Text with the same length and same
|
|
|
+ * contents.
|
|
|
+ */
|
|
|
@Override
|
|
|
public boolean equals(Object o) {
|
|
|
if (o instanceof Text)
|
|
@@ -365,7 +377,7 @@ public class Text extends BinaryComparable
|
|
|
byte[] b2, int s2, int l2) {
|
|
|
int n1 = WritableUtils.decodeVIntSize(b1[s1]);
|
|
|
int n2 = WritableUtils.decodeVIntSize(b2[s2]);
|
|
|
- return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
|
|
|
+ return compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -383,12 +395,12 @@ public class Text extends BinaryComparable
|
|
|
public static String decode(byte[] utf8) throws CharacterCodingException {
|
|
|
return decode(ByteBuffer.wrap(utf8), true);
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
public static String decode(byte[] utf8, int start, int length)
|
|
|
throws CharacterCodingException {
|
|
|
return decode(ByteBuffer.wrap(utf8, start, length), true);
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/**
|
|
|
* Converts the provided byte array to a String using the
|
|
|
* UTF-8 encoding. If <code>replace</code> is true, then
|
|
@@ -400,7 +412,7 @@ public class Text extends BinaryComparable
|
|
|
throws CharacterCodingException {
|
|
|
return decode(ByteBuffer.wrap(utf8, start, length), replace);
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
private static String decode(ByteBuffer utf8, boolean replace)
|
|
|
throws CharacterCodingException {
|
|
|
CharsetDecoder decoder = DECODER_FACTORY.get();
|
|
@@ -463,7 +475,7 @@ public class Text extends BinaryComparable
|
|
|
public static String readString(DataInput in) throws IOException {
|
|
|
return readString(in, Integer.MAX_VALUE);
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/** Read a UTF8 encoded string with a maximum size
|
|
|
*/
|
|
|
public static String readString(DataInput in, int maxLength)
|
|
@@ -473,8 +485,9 @@ public class Text extends BinaryComparable
|
|
|
in.readFully(bytes, 0, length);
|
|
|
return decode(bytes);
|
|
|
}
|
|
|
-
|
|
|
- /** Write a UTF8 encoded string to out
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Write a UTF8 encoded string to out.
|
|
|
*/
|
|
|
public static int writeString(DataOutput out, String s) throws IOException {
|
|
|
ByteBuffer bytes = encode(s);
|
|
@@ -484,7 +497,8 @@ public class Text extends BinaryComparable
|
|
|
return length;
|
|
|
}
|
|
|
|
|
|
- /** Write a UTF8 encoded string with a maximum size to out
|
|
|
+ /**
|
|
|
+ * Write a UTF8 encoded string with a maximum size to out.
|
|
|
*/
|
|
|
public static int writeString(DataOutput out, String s, int maxLength)
|
|
|
throws IOException {
|
|
@@ -501,24 +515,26 @@ public class Text extends BinaryComparable
|
|
|
}
|
|
|
|
|
|
////// states for validateUTF8
|
|
|
-
|
|
|
+
|
|
|
private static final int LEAD_BYTE = 0;
|
|
|
|
|
|
private static final int TRAIL_BYTE_1 = 1;
|
|
|
|
|
|
private static final int TRAIL_BYTE = 2;
|
|
|
|
|
|
- /**
|
|
|
- * Check if a byte array contains valid utf-8
|
|
|
+ /**
|
|
|
+ * Check if a byte array contains valid UTF-8.
|
|
|
+ *
|
|
|
* @param utf8 byte array
|
|
|
- * @throws MalformedInputException if the byte array contains invalid utf-8
|
|
|
+ * @throws MalformedInputException if the byte array contains invalid UTF-8
|
|
|
*/
|
|
|
public static void validateUTF8(byte[] utf8) throws MalformedInputException {
|
|
|
- validateUTF8(utf8, 0, utf8.length);
|
|
|
+ validateUTF8(utf8, 0, utf8.length);
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/**
|
|
|
- * Check to see if a byte array is valid utf-8
|
|
|
+ * Check to see if a byte array is valid UTF-8.
|
|
|
+ *
|
|
|
* @param utf8 the array of bytes
|
|
|
* @param start the offset of the first byte in the array
|
|
|
* @param len the length of the byte sequence
|
|
@@ -641,7 +657,6 @@ public class Text extends BinaryComparable
|
|
|
return ch;
|
|
|
}
|
|
|
|
|
|
-
|
|
|
static final int offsetsFromUTF8[] =
|
|
|
{ 0x00000000, 0x00003080,
|
|
|
0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
|