|
@@ -21,6 +21,7 @@ package org.apache.hadoop.io;
|
|
import java.io.IOException;
|
|
import java.io.IOException;
|
|
import java.io.DataInput;
|
|
import java.io.DataInput;
|
|
import java.io.DataOutput;
|
|
import java.io.DataOutput;
|
|
|
|
+import java.io.UTFDataFormatException;
|
|
|
|
|
|
import org.apache.hadoop.util.StringUtils;
|
|
import org.apache.hadoop.util.StringUtils;
|
|
|
|
|
|
@@ -155,6 +156,21 @@ public class UTF8 implements WritableComparable<UTF8> {
|
|
}
|
|
}
|
|
return buffer.toString();
|
|
return buffer.toString();
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Convert to a string, checking for valid UTF8.
|
|
|
|
+ * @return the converted string
|
|
|
|
+ * @throws UTFDataFormatException if the underlying bytes contain invalid
|
|
|
|
+ * UTF8 data.
|
|
|
|
+ */
|
|
|
|
+ public String toStringChecked() throws IOException {
|
|
|
|
+ StringBuilder buffer = new StringBuilder(length);
|
|
|
|
+ synchronized (IBUF) {
|
|
|
|
+ IBUF.reset(bytes, length);
|
|
|
|
+ readChars(IBUF, buffer, length);
|
|
|
|
+ }
|
|
|
|
+ return buffer.toString();
|
|
|
|
+ }
|
|
|
|
|
|
/** Returns true iff <code>o</code> is a UTF8 with the same contents. */
|
|
/** Returns true iff <code>o</code> is a UTF8 with the same contents. */
|
|
@Override
|
|
@Override
|
|
@@ -238,7 +254,7 @@ public class UTF8 implements WritableComparable<UTF8> {
|
|
}
|
|
}
|
|
|
|
|
|
private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
|
|
private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
|
|
- throws IOException {
|
|
|
|
|
|
+ throws UTFDataFormatException, IOException {
|
|
DataOutputBuffer obuf = OBUF_FACTORY.get();
|
|
DataOutputBuffer obuf = OBUF_FACTORY.get();
|
|
obuf.reset();
|
|
obuf.reset();
|
|
obuf.write(in, nBytes);
|
|
obuf.write(in, nBytes);
|
|
@@ -250,15 +266,27 @@ public class UTF8 implements WritableComparable<UTF8> {
|
|
// 0b0xxxxxxx: 1-byte sequence
|
|
// 0b0xxxxxxx: 1-byte sequence
|
|
buffer.append((char)(b & 0x7F));
|
|
buffer.append((char)(b & 0x7F));
|
|
} else if ((b & 0xE0) == 0xC0) {
|
|
} else if ((b & 0xE0) == 0xC0) {
|
|
|
|
+ if (i >= nBytes) {
|
|
|
|
+ throw new UTFDataFormatException("Truncated UTF8 at " +
|
|
|
|
+ StringUtils.byteToHexString(bytes, i - 1, 1));
|
|
|
|
+ }
|
|
// 0b110xxxxx: 2-byte sequence
|
|
// 0b110xxxxx: 2-byte sequence
|
|
buffer.append((char)(((b & 0x1F) << 6)
|
|
buffer.append((char)(((b & 0x1F) << 6)
|
|
| (bytes[i++] & 0x3F)));
|
|
| (bytes[i++] & 0x3F)));
|
|
} else if ((b & 0xF0) == 0xE0) {
|
|
} else if ((b & 0xF0) == 0xE0) {
|
|
// 0b1110xxxx: 3-byte sequence
|
|
// 0b1110xxxx: 3-byte sequence
|
|
|
|
+ if (i + 1 >= nBytes) {
|
|
|
|
+ throw new UTFDataFormatException("Truncated UTF8 at " +
|
|
|
|
+ StringUtils.byteToHexString(bytes, i - 1, 2));
|
|
|
|
+ }
|
|
buffer.append((char)(((b & 0x0F) << 12)
|
|
buffer.append((char)(((b & 0x0F) << 12)
|
|
| ((bytes[i++] & 0x3F) << 6)
|
|
| ((bytes[i++] & 0x3F) << 6)
|
|
| (bytes[i++] & 0x3F)));
|
|
| (bytes[i++] & 0x3F)));
|
|
} else if ((b & 0xF8) == 0xF0) {
|
|
} else if ((b & 0xF8) == 0xF0) {
|
|
|
|
+ if (i + 2 >= nBytes) {
|
|
|
|
+ throw new UTFDataFormatException("Truncated UTF8 at " +
|
|
|
|
+ StringUtils.byteToHexString(bytes, i - 1, 3));
|
|
|
|
+ }
|
|
// 0b11110xxx: 4-byte sequence
|
|
// 0b11110xxx: 4-byte sequence
|
|
int codepoint =
|
|
int codepoint =
|
|
((b & 0x07) << 18)
|
|
((b & 0x07) << 18)
|
|
@@ -274,8 +302,8 @@ public class UTF8 implements WritableComparable<UTF8> {
|
|
// Only show the next 6 bytes max in the error code - in case the
|
|
// Only show the next 6 bytes max in the error code - in case the
|
|
// buffer is large, this will prevent an exceedingly large message.
|
|
// buffer is large, this will prevent an exceedingly large message.
|
|
int endForError = Math.min(i + 5, nBytes);
|
|
int endForError = Math.min(i + 5, nBytes);
|
|
- throw new IOException("Invalid UTF8 at " +
|
|
|
|
- StringUtils.byteToHexString(bytes, i - 1, endForError));
|
|
|
|
|
|
+ throw new UTFDataFormatException("Invalid UTF8 at " +
|
|
|
|
+ StringUtils.byteToHexString(bytes, i - 1, endForError));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|