|
@@ -134,6 +134,39 @@ public class LineRecordReader extends RecordReader<LongWritable, Text> {
|
|
|
return retVal;
|
|
|
}
|
|
|
|
|
|
+ private int skipUtfByteOrderMark() throws IOException {
|
|
|
+ // Read the first line and strip a leading BOM (Byte Order Mark), if any.
|
|
|
+ // Text only supports UTF-8, so we only need to check for the UTF-8 BOM
|
|
|
+ // (0xEF,0xBB,0xBF) at the start of the text stream.
|
|
|
+ int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength,
|
|
|
+ Integer.MAX_VALUE);
|
|
|
+ int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
|
|
|
+ // Even though we allow 3 extra bytes for the first line,
|
|
|
+ // we won't alter existing behavior (no backwards incompat issue),
|
|
|
+ // because the newSize is less than maxLineLength and
|
|
|
+ // the number of bytes copied to Text is always no more than newSize.
|
|
|
+ // If the return size from readLine is not less than maxLineLength,
|
|
|
+ // we will discard the current line and read the next line.
|
|
|
+ pos += newSize;
|
|
|
+ int textLength = value.getLength();
|
|
|
+ byte[] textBytes = value.getBytes();
|
|
|
+ if ((textLength >= 3) && (textBytes[0] == (byte)0xEF) &&
|
|
|
+ (textBytes[1] == (byte)0xBB) && (textBytes[2] == (byte)0xBF)) {
|
|
|
+ // Found the UTF-8 BOM; strip it.
|
|
|
+ LOG.info("Found UTF-8 BOM and skipped it");
|
|
|
+ textLength -= 3;
|
|
|
+ newSize -= 3;
|
|
|
+ if (textLength > 0) {
|
|
|
+ // It may work to reuse the same buffer and skip the copyBytes
|
|
|
+ textBytes = value.copyBytes();
|
|
|
+ value.set(textBytes, 3, textLength);
|
|
|
+ } else {
|
|
|
+ value.clear();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return newSize;
|
|
|
+ }
|
|
|
+
|
|
|
public boolean nextKeyValue() throws IOException {
|
|
|
if (key == null) {
|
|
|
key = new LongWritable();
|
|
@@ -146,9 +179,14 @@ public class LineRecordReader extends RecordReader<LongWritable, Text> {
|
|
|
// We always read one extra line, which lies outside the upper
|
|
|
// split limit i.e. (end - 1)
|
|
|
while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
|
|
|
- newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
|
|
|
- pos += newSize;
|
|
|
- if (newSize < maxLineLength) {
|
|
|
+ if (pos == 0) {
|
|
|
+ newSize = skipUtfByteOrderMark();
|
|
|
+ } else {
|
|
|
+ newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
|
|
|
+ pos += newSize;
|
|
|
+ }
|
|
|
+
|
|
|
+ if ((newSize == 0) || (newSize < maxLineLength)) {
|
|
|
break;
|
|
|
}
|
|
|
|