UTF8.java 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  /**
   * Copyright 2005 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   * http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  package org.apache.hadoop.io;

  import java.io.DataInput;
  import java.io.DataOutput;
  import java.io.IOException;
  import java.io.UTFDataFormatException;

  import org.apache.commons.logging.*;
  21. /** A WritableComparable for strings that uses the UTF8 encoding.
  22. *
  23. * <p>Also includes utilities for efficiently reading and writing UTF-8.
  24. *
  25. * @author Doug Cutting
  26. */
  27. public class UTF8 implements WritableComparable {
  28. private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.UTF8");
  29. private static final DataOutputBuffer OBUF = new DataOutputBuffer();
  30. private static final DataInputBuffer IBUF = new DataInputBuffer();
  31. private static final byte[] EMPTY_BYTES = new byte[0];
  32. private byte[] bytes = EMPTY_BYTES;
  33. private int length;
  34. public UTF8() {
  35. //set("");
  36. }
  37. /** Construct from a given string. */
  38. public UTF8(String string) {
  39. set(string);
  40. }
  41. /** Construct from a given string. */
  42. public UTF8(UTF8 utf8) {
  43. set(utf8);
  44. }
  45. /** The raw bytes. */
  46. public byte[] getBytes() {
  47. return bytes;
  48. }
  49. /** The number of bytes in the encoded string. */
  50. public int getLength() {
  51. return length;
  52. }
  53. /** Set to contain the contents of a string. */
  54. public void set(String string) {
  55. if (string.length() > 0xffff/3) { // maybe too long
  56. LOG.warn("truncating long string: " + string.length()
  57. + " chars, starting with " + string.substring(0, 20));
  58. string = string.substring(0, 0xffff/3);
  59. }
  60. length = utf8Length(string); // compute length
  61. if (length > 0xffff) // double-check length
  62. throw new RuntimeException("string too long!");
  63. if (bytes == null || length > bytes.length) // grow buffer
  64. bytes = new byte[length];
  65. try { // avoid sync'd allocations
  66. synchronized (OBUF) {
  67. OBUF.reset();
  68. writeChars(OBUF, string, 0, string.length());
  69. System.arraycopy(OBUF.getData(), 0, bytes, 0, length);
  70. }
  71. } catch (IOException e) {
  72. throw new RuntimeException(e);
  73. }
  74. }
  75. /** Set to contain the contents of a string. */
  76. public void set(UTF8 other) {
  77. length = other.length;
  78. if (bytes == null || length > bytes.length) // grow buffer
  79. bytes = new byte[length];
  80. System.arraycopy(other.bytes, 0, bytes, 0, length);
  81. }
  82. public void readFields(DataInput in) throws IOException {
  83. length = in.readUnsignedShort();
  84. if (bytes == null || bytes.length < length)
  85. bytes = new byte[length];
  86. in.readFully(bytes, 0, length);
  87. }
  88. /** Skips over one UTF8 in the input. */
  89. public static void skip(DataInput in) throws IOException {
  90. int length = in.readUnsignedShort();
  91. in.skipBytes(length);
  92. }
  93. public void write(DataOutput out) throws IOException {
  94. out.writeShort(length);
  95. out.write(bytes, 0, length);
  96. }
  97. /** Compare two UTF8s. */
  98. public int compareTo(Object o) {
  99. UTF8 that = (UTF8)o;
  100. return WritableComparator.compareBytes(bytes, 0, length,
  101. that.bytes, 0, that.length);
  102. }
  103. /** Convert to a String. */
  104. public String toString() {
  105. StringBuffer buffer = new StringBuffer(length);
  106. try {
  107. synchronized (IBUF) {
  108. IBUF.reset(bytes, length);
  109. readChars(IBUF, buffer, length);
  110. }
  111. } catch (IOException e) {
  112. throw new RuntimeException(e);
  113. }
  114. return buffer.toString();
  115. }
  116. /** Returns true iff <code>o</code> is a UTF8 with the same contents. */
  117. public boolean equals(Object o) {
  118. if (!(o instanceof UTF8))
  119. return false;
  120. UTF8 that = (UTF8)o;
  121. if (this.length != that.length)
  122. return false;
  123. else
  124. return WritableComparator.compareBytes(bytes, 0, length,
  125. that.bytes, 0, that.length) == 0;
  126. }
  127. public int hashCode() {
  128. return WritableComparator.hashBytes(bytes, length);
  129. }
  130. /** A WritableComparator optimized for UTF8 keys. */
  131. public static class Comparator extends WritableComparator {
  132. public Comparator() {
  133. super(UTF8.class);
  134. }
  135. public int compare(byte[] b1, int s1, int l1,
  136. byte[] b2, int s2, int l2) {
  137. int n1 = readUnsignedShort(b1, s1);
  138. int n2 = readUnsignedShort(b2, s2);
  139. return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
  140. }
  141. }
  142. static { // register this comparator
  143. WritableComparator.define(UTF8.class, new Comparator());
  144. }
  145. /// STATIC UTILITIES FROM HERE DOWN
  146. /// These are probably not used much anymore, and might be removed...
  147. /** Convert a string to a UTF-8 encoded byte array.
  148. * @see String#getBytes(String)
  149. */
  150. public static byte[] getBytes(String string) {
  151. byte[] result = new byte[utf8Length(string)];
  152. try { // avoid sync'd allocations
  153. synchronized (OBUF) {
  154. OBUF.reset();
  155. writeChars(OBUF, string, 0, string.length());
  156. System.arraycopy(OBUF.getData(), 0, result, 0, OBUF.getLength());
  157. }
  158. } catch (IOException e) {
  159. throw new RuntimeException(e);
  160. }
  161. return result;
  162. }
  163. /** Read a UTF-8 encoded string.
  164. *
  165. * @see DataInput#readUTF()
  166. */
  167. public static String readString(DataInput in) throws IOException {
  168. int bytes = in.readUnsignedShort();
  169. StringBuffer buffer = new StringBuffer(bytes);
  170. readChars(in, buffer, bytes);
  171. return buffer.toString();
  172. }
  173. private static void readChars(DataInput in, StringBuffer buffer, int nBytes)
  174. throws IOException {
  175. synchronized (OBUF) {
  176. OBUF.reset();
  177. OBUF.write(in, nBytes);
  178. byte[] bytes = OBUF.getData();
  179. int i = 0;
  180. while (i < nBytes) {
  181. byte b = bytes[i++];
  182. if ((b & 0x80) == 0) {
  183. buffer.append((char)(b & 0x7F));
  184. } else if ((b & 0xE0) != 0xE0) {
  185. buffer.append((char)(((b & 0x1F) << 6)
  186. | (bytes[i++] & 0x3F)));
  187. } else {
  188. buffer.append((char)(((b & 0x0F) << 12)
  189. | ((bytes[i++] & 0x3F) << 6)
  190. | (bytes[i++] & 0x3F)));
  191. }
  192. }
  193. }
  194. }
  195. /** Write a UTF-8 encoded string.
  196. *
  197. * @see DataOutput#writeUTF(String)
  198. */
  199. public static int writeString(DataOutput out, String s) throws IOException {
  200. if (s.length() > 0xffff/3) { // maybe too long
  201. LOG.warn("truncating long string: " + s.length()
  202. + " chars, starting with " + s.substring(0, 20));
  203. s = s.substring(0, 0xffff/3);
  204. }
  205. int len = utf8Length(s);
  206. if (len > 0xffff) // double-check length
  207. throw new IOException("string too long!");
  208. out.writeShort(len);
  209. writeChars(out, s, 0, s.length());
  210. return len;
  211. }
  212. /** Returns the number of bytes required to write this. */
  213. private static int utf8Length(String string) {
  214. int stringLength = string.length();
  215. int utf8Length = 0;
  216. for (int i = 0; i < stringLength; i++) {
  217. int c = string.charAt(i);
  218. if ((c >= 0x0001) && (c <= 0x007F)) {
  219. utf8Length++;
  220. } else if (c > 0x07FF) {
  221. utf8Length += 3;
  222. } else {
  223. utf8Length += 2;
  224. }
  225. }
  226. return utf8Length;
  227. }
  228. private static void writeChars(DataOutput out,
  229. String s, int start, int length)
  230. throws IOException {
  231. final int end = start + length;
  232. for (int i = start; i < end; i++) {
  233. int code = s.charAt(i);
  234. if (code >= 0x01 && code <= 0x7F) {
  235. out.writeByte((byte)code);
  236. } else if (code <= 0x07FF) {
  237. out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F)));
  238. out.writeByte((byte)(0x80 | code & 0x3F));
  239. } else {
  240. out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F)));
  241. out.writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
  242. out.writeByte((byte)(0x80 | (code & 0x3F)));
  243. }
  244. }
  245. }
  246. }