Bläddra i källkod

Fix for HADOOP-80. Make BytesWritable also a WritableComparable. Also add hashBytes() utility method to WritableComparator and use it to hash both BytesWritable and UTF8. Contributed by Owen O'Malley.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@386219 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 19 år sedan
förälder
incheckning
aeddf7427d

+ 131 - 14
src/java/org/apache/hadoop/io/BytesWritable.java

@@ -20,31 +20,148 @@ import java.io.IOException;
 import java.io.DataInput;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.DataOutput;
 
 
-/** A Writable for byte arrays.
- * 
+/** 
+ * A byte sequence that is usable as a key or value.
+ * It is resizable and distinguishes between the size of the sequence and
+ * the current capacity. The hash code is computed over the first size bytes
+ * by WritableComparator.hashBytes. The sort order is the same as memcmp.
  * @author Doug Cutting
  * @author Doug Cutting
  */
  */
-public class BytesWritable implements Writable {
+public class BytesWritable implements WritableComparable {
+  private int size;
   private byte[] bytes;
   private byte[] bytes;
-
-  public BytesWritable() {}
-
+  
+  /**
+   * Create a zero-size sequence.
+   */
+  public BytesWritable() {
+    size = 0;
+    bytes = new byte[100];
+  }
+  
+  /**
+   * Create a BytesWritable using the byte array as the initial value.
+   * @param bytes This array becomes the backing storage for the object.
+   */
   public BytesWritable(byte[] bytes) {
   public BytesWritable(byte[] bytes) {
     this.bytes = bytes;
     this.bytes = bytes;
+    this.size = bytes.length;
   }
   }
-
+  
+  /**
+   * Get the data from the BytesWritable.
+   * @return The data is only valid between 0 and getSize() - 1.
+   */
   public byte[] get() {
   public byte[] get() {
     return bytes;
     return bytes;
   }
   }
-
+  
+  /**
+   * Get the current size of the buffer.
+   * @return The number of valid bytes in the buffer.
+   */
+  public int getSize() {
+    return size;
+  }
+  
+  /**
+   * Change the size of the buffer. The values in the old range are preserved
+   * and any new values are undefined. The capacity is changed if it is 
+   * necessary.
+   * @param size The new number of bytes
+   */
+  public void setSize(int size) {
+    if (size > getCapacity()) {
+      setCapacity(size * 3 / 2);
+    }
+    this.size = size;
+  }
+  
+  /**
+   * Get the capacity, which is the maximum size that could be handled without
+   * resizing the backing storage.
+   * @return The number of bytes
+   */
+  public int getCapacity() {
+    return bytes.length;
+  }
+  
+  /**
+   * Change the capacity of the backing storage.
+   * The data is preserved.
+   * @param new_cap The new capacity in bytes.
+   */
+  public void setCapacity(int new_cap) {
+    if (new_cap != getCapacity()) {
+      byte[] new_data = new byte[new_cap];
+      if (new_cap < size) {
+        size = new_cap;
+      }
+      if (size != 0) {
+        System.arraycopy(bytes, 0, new_data, 0, size);
+      }
+      bytes = new_data;
+    }
+  }
+  
+  // inherit javadoc
   public void readFields(DataInput in) throws IOException {
   public void readFields(DataInput in) throws IOException {
-    bytes = new byte[in.readInt()];
-    in.readFully(bytes, 0, bytes.length);
+    setSize(0); // clear the old data
+    setSize(in.readInt());
+    in.readFully(bytes, 0, size);
   }
   }
-
+  
+  // inherit javadoc
   public void write(DataOutput out) throws IOException {
   public void write(DataOutput out) throws IOException {
-    out.writeInt(bytes.length);
-    out.write(bytes);
+    out.writeInt(size);
+    out.write(bytes, 0, size);
   }
   }
-
+  
+  public int hashCode() {
+    return WritableComparator.hashBytes(bytes, size);
+  }
+  
+  /**
+   * Define the sort order of the BytesWritable.
+   * @param right_obj The other bytes writable
+   * @return Positive if left is bigger than right, 0 if they are equal, and
+   *         negative if left is smaller than right.
+   */
+  public int compareTo(Object right_obj) {
+    BytesWritable right = ((BytesWritable) right_obj);
+    return WritableComparator.compareBytes(bytes, 0, size, 
+                                           right.bytes, 0, right.size);
+  }
+  
+  /**
+   * Are the two byte sequences equal?
+   */
+  public boolean equals(Object right_obj) {
+    if (right_obj instanceof BytesWritable) {
+      return compareTo(right_obj) == 0;
+    }
+    return false;
+  }
+  
+  /** A Comparator optimized for BytesWritable. */ 
+  public static class Comparator extends WritableComparator {
+    public Comparator() {
+      super(BytesWritable.class);
+    }
+    
+    /**
+     * Compare the buffers in serialized form.
+     */
+    public int compare(byte[] b1, int s1, int l1,
+        byte[] b2, int s2, int l2) {
+      int size1 = readInt(b1, s1);
+      int size2 = readInt(b2, s2);
+      return compareBytes(b1,s1+4, size1, b2, s2+4, size2);
+    }
+  }
+  
+  static {                                        // register this comparator
+    WritableComparator.define(BytesWritable.class, new Comparator());
+  }
+  
 }
 }

+ 1 - 4
src/java/org/apache/hadoop/io/UTF8.java

@@ -149,10 +149,7 @@ public class UTF8 implements WritableComparable {
   }
   }
 
 
   public int hashCode() {
   public int hashCode() {
-    int hash = 1;
-    for (int i = 0; i < length; i++)
-      hash = (31 * hash) + (int)bytes[i];
-    return hash;
+    return WritableComparator.hashBytes(bytes, length);
   }
   }
 
 
   /** A WritableComparator optimized for UTF8 keys. */
   /** A WritableComparator optimized for UTF8 keys. */

+ 8 - 0
src/java/org/apache/hadoop/io/WritableComparator.java

@@ -124,6 +124,14 @@ public class WritableComparator implements Comparator {
     return l1 - l2;
     return l1 - l2;
   }
   }
 
 
+  /** Compute hash for binary data. */
+  public static int hashBytes(byte[] bytes, int length) {
+    int hash = 1;
+    for (int i = 0; i < length; i++)
+      hash = (31 * hash) + (int)bytes[i];
+    return hash;
+  }
+
   /** Parse an unsigned short from a byte array. */
   /** Parse an unsigned short from a byte array. */
   public static int readUnsignedShort(byte[] bytes, int start) {
   public static int readUnsignedShort(byte[] bytes, int start) {
     return (((bytes[start]   & 0xff) <<  8) +
     return (((bytes[start]   & 0xff) <<  8) +

+ 68 - 0
src/test/org/apache/hadoop/io/TestBytesWritable.java

@@ -0,0 +1,68 @@
+package org.apache.hadoop.io;
+
+import junit.framework.TestCase;
+
+/**
+ * This is the unit test for BytesWritable.
+ * @author Owen O'Malley
+ */
+public class TestBytesWritable extends TestCase {
+
+  public void testSizeChange() throws Exception {
+    byte[] hadoop = "hadoop".getBytes();
+    BytesWritable buf = new BytesWritable(hadoop);
+    int size = buf.getSize();
+    int orig_capacity = buf.getCapacity();
+    buf.setSize(size*2);
+    int new_capacity = buf.getCapacity();
+    System.arraycopy(buf.get(),0, buf.get(), size, size);
+    assertTrue(new_capacity >= size * 2);
+    assertEquals(size * 2, buf.getSize());
+    assertTrue(new_capacity != orig_capacity);
+    buf.setSize(size*4);
+    assertTrue(new_capacity != buf.getCapacity());
+    for(int i=0; i < size*2; ++i) {
+      assertEquals(hadoop[i%size], buf.get()[i]);
+    }
+    // shrink the buffer
+    buf.setCapacity(1);
+    // make sure the size has been cut down too
+    assertEquals(1, buf.getSize());
+    // but that the data is still there
+    assertEquals(hadoop[0], buf.get()[0]);
+  }
+  
+  public void testHash() throws Exception {
+    byte[] owen = "owen".getBytes();
+    BytesWritable buf = new BytesWritable(owen);
+    assertEquals(4347922, buf.hashCode());
+    buf.setCapacity(10000);
+    assertEquals(4347922, buf.hashCode());
+    buf.setSize(0);
+    assertEquals(1, buf.hashCode());
+  }
+  
+  public void testCompare() throws Exception {
+    byte[][] values = new byte[][]{"abc".getBytes(), 
+        "ad".getBytes(),
+        "abcd".getBytes(),
+        "".getBytes(),
+        "b".getBytes()};
+    BytesWritable[] buf = new BytesWritable[values.length];
+    for(int i=0; i < values.length; ++i) {
+      buf[i] = new BytesWritable(values[i]);
+    }
+    // check to make sure the compare function is antisymmetric and reflexive
+    for(int i=0; i < values.length; ++i) {
+      for(int j=0; j < values.length; ++j) {
+        assertTrue(buf[i].compareTo(buf[j]) == -buf[j].compareTo(buf[i]));
+        assertTrue((i == j) == (buf[i].compareTo(buf[j]) == 0));
+      }
+    }
+    assertTrue(buf[0].compareTo(buf[1]) < 0);
+    assertTrue(buf[1].compareTo(buf[2]) > 0);
+    assertTrue(buf[2].compareTo(buf[3]) > 0);
+    assertTrue(buf[3].compareTo(buf[4]) < 0);
+  }
+}
+