소스 검색

ZOOKEEPER-507. BookKeeper client re-write (Utkarsh and ben via mahadev)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/zookeeper/trunk@903483 13f79535-47bb-0310-9956-ffa450edef68
Mahadev Konar 15 년 전
부모
커밋
e1a1ee81f1
62개의 변경된 파일, 6345개의 추가 및 6711개의 삭제
  1. 2 0
      CHANGES.txt
  2. 214 95
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/Bookie.java
  3. 157 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/BufferedChannel.java
  4. 264 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/EntryLogger.java
  5. 113 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/FileInfo.java
  6. 454 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/LedgerCache.java
  7. 44 76
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/LedgerDescriptor.java
  8. 151 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/LedgerEntryPage.java
  9. 147 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/MarkerFileChannel.java
  10. 93 73
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/AsyncCallback.java
  11. 0 107
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/BKDefs.java
  12. 105 41
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/BKException.java
  13. 301 565
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/BookKeeper.java
  14. 0 371
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/BookieHandle.java
  15. 204 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/BookieWatcher.java
  16. 50 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/CRC32DigestManager.java
  17. 0 138
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/ClientCBWorker.java
  18. 162 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/DigestManager.java
  19. 61 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/DistributionSchedule.java
  20. 163 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerCreateOp.java
  21. 46 26
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerEntry.java
  22. 399 795
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerHandle.java
  23. 0 1272
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerManagementProcessor.java
  24. 179 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerMetadata.java
  25. 136 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerOpenOp.java
  26. 0 245
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerRecoveryMonitor.java
  27. 167 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerRecoveryOp.java
  28. 0 65
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerSequence.java
  29. 67 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/MacDigestManager.java
  30. 137 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/PendingAddOp.java
  31. 145 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/PendingReadOp.java
  32. 0 299
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/QuorumEngine.java
  33. 0 472
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/QuorumOpMonitor.java
  34. 87 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/RoundRobinDistributionSchedule.java
  35. 85 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/SyncCounter.java
  36. 82 345
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/BookieClient.java
  37. 19 20
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/BookieProtocol.java
  38. 52 44
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/BookieServer.java
  39. 57 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/BookkeeperInternalCallbacks.java
  40. 28 38
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/NIOServerFactory.java
  41. 570 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java
  42. 0 35
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/ReadEntryCallback.java
  43. 51 36
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/ServerStats.java
  44. 0 32
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/WriteCallback.java
  45. 32 28
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/streaming/LedgerInputStream.java
  46. 32 33
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/streaming/LedgerOutputStream.java
  47. 2 8
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/LocalBookKeeper.java
  48. 6 7
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/Main.java
  49. 38 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/MathUtils.java
  50. 98 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/OrderedSafeExecutor.java
  51. 38 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/SafeRunnable.java
  52. 94 0
      src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/StringUtils.java
  53. 109 247
      src/contrib/bookkeeper/test/org/apache/bookkeeper/test/AsyncLedgerOpsTest.java
  54. 178 0
      src/contrib/bookkeeper/test/org/apache/bookkeeper/test/BaseTestCase.java
  55. 99 76
      src/contrib/bookkeeper/test/org/apache/bookkeeper/test/BookieClientTest.java
  56. 157 238
      src/contrib/bookkeeper/test/org/apache/bookkeeper/test/BookieFailureTest.java
  57. 181 296
      src/contrib/bookkeeper/test/org/apache/bookkeeper/test/BookieReadWriteTest.java
  58. 29 249
      src/contrib/bookkeeper/test/org/apache/bookkeeper/test/CloseTest.java
  59. 178 0
      src/contrib/bookkeeper/test/org/apache/bookkeeper/test/ConcurrentLedgerTest.java
  60. 36 288
      src/contrib/bookkeeper/test/org/apache/bookkeeper/test/LedgerRecoveryTest.java
  61. 43 48
      src/contrib/bookkeeper/test/org/apache/bookkeeper/test/LoopbackClient.java
  62. 3 3
      src/contrib/bookkeeper/test/org/apache/bookkeeper/test/NIOServerFactoryTest.java

+ 2 - 0
CHANGES.txt

@@ -266,6 +266,8 @@ IMPROVEMENTS:
   ZOOKEEPER-593.  java client api does not allow client to access negotiated
   session timeout (phunt via mahadev)
 
+  ZOOKEEPER-507. BookKeeper client re-write (Utkarsh and ben via mahadev)
+
 NEW FEATURES:
   ZOOKEEPER-539. generate eclipse project via ant target. (phunt via mahadev)
 

+ 214 - 95
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/Bookie.java

@@ -1,4 +1,3 @@
-package org.apache.bookkeeper.bookie;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,19 +19,25 @@ package org.apache.bookkeeper.bookie;
  * 
  */
 
+package org.apache.bookkeeper.bookie;
 
 import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.net.InetSocketAddress;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.LinkedList;
-import java.util.Random;
 import java.util.concurrent.LinkedBlockingQueue;
 
 import org.apache.bookkeeper.bookie.BookieException;
-import org.apache.bookkeeper.proto.WriteCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.WriteCallback;
 import org.apache.log4j.Logger;
 
 
@@ -45,10 +50,6 @@ import org.apache.log4j.Logger;
 public class Bookie extends Thread {
     HashMap<Long, LedgerDescriptor> ledgers = new HashMap<Long, LedgerDescriptor>();
     static Logger LOG = Logger.getLogger(Bookie.class);
-    /**
-     * 4 byte signature followed by 2-byte major and 2-byte minor versions
-     */
-    private static byte ledgerHeader[] =  { 0x42, 0x6f, 0x6f, 0x6b, 0, 0, 0, 0};
     
     final File journalDirectory;
 
@@ -69,6 +70,7 @@ public class Bookie extends Thread {
         private long ledgerId;
         private long entryId;
         public NoEntryException(long ledgerId, long entryId) {
+            super("Entry " + entryId + " not found in " + ledgerId);
             this.ledgerId = ledgerId;
             this.entryId = entryId;
         }
@@ -80,14 +82,124 @@ public class Bookie extends Thread {
         }
     }
 
-    public Bookie(File journalDirectory, File ledgerDirectories[]) {
+    EntryLogger entryLogger;
+    LedgerCache ledgerCache;
+    class SyncThread extends Thread { // Periodically flushes ledgerCache and entryLogger, then persists lastLogMark.
+        volatile boolean running = true; // loop flag; Bookie.shutdown() sets it to false
+        public SyncThread() {
+            super("SyncThread");
+        }
+        @Override
+        public void run() {
+            while(running) {
+                synchronized(this) {
+                    try {
+                        wait(100); // wake at most every 100 ms
+                        if (!entryLogger.testAndClearSomethingWritten()) {
+                            continue; // returned false: skip this round (presumably nothing was written since the last check -- see EntryLogger)
+                        }
+                    } catch (InterruptedException e) {
+                        Thread.currentThread().interrupt(); // NOTE(review): with the flag re-set, the next wait() throws again immediately, so the loop spins until running is false -- confirm intended
+                        continue;
+                    }
+                }
+                lastLogMark.markLog(); // snapshot the journal mark before flushing
+                try {
+                    ledgerCache.flushLedger(true);
+                } catch (IOException e) {
+                    e.printStackTrace(); // failure is only printed; the loop carries on to the next step
+                }
+                try {
+                    entryLogger.flush();
+                } catch (IOException e) {
+                    e.printStackTrace(); // same: printed, not propagated
+                }
+                lastLogMark.rollLog(); // writes the mark to the "lastMark" files; runs even if a flush above failed
+            }
+        }
+    }
+    SyncThread syncThread = new SyncThread();
+    public Bookie(File journalDirectory, File ledgerDirectories[]) throws IOException { // Creates the entry log and ledger cache, replays journal files starting at the last persisted mark, then starts this thread and the sync thread.
         this.journalDirectory = journalDirectory;
         this.ledgerDirectories = ledgerDirectories;
+        entryLogger = new EntryLogger(ledgerDirectories);
+        ledgerCache = new LedgerCache(ledgerDirectories);
+        lastLogMark.readLog(); // load the last persisted mark from the "lastMark" files
+        final long markedLogId = lastLogMark.txnLogId;
+        if (markedLogId > 0) { // lastLogMark starts at (0, 0); a value of 0 means no mark was read, so nothing is replayed
+            File logFiles[] = journalDirectory.listFiles(); // NOTE(review): listFiles() returns null if the path is not a directory or on I/O error; not checked here
+            ArrayList<Long> logs = new ArrayList<Long>();
+            for(File f: logFiles) {
+                String name = f.getName();
+                if (!name.endsWith(".txn")) {
+                    continue; // only journal files are considered
+                }
+                String idString = name.split("\\.")[0];
+                long id = Long.parseLong(idString, 16); // journal files are named <hex id>.txn (see openChannel)
+                if (id < markedLogId) {
+                    continue; // older than the mark: skipped
+                }
+                logs.add(id);
+            }
+            Collections.sort(logs); // replay in ascending id order
+            if (logs.size() == 0 || logs.get(0) != markedLogId) {
+                throw new IOException("Recovery log " + markedLogId + " is missing");
+            }
+            ByteBuffer lenBuff = ByteBuffer.allocate(4);
+            ByteBuffer recBuff = ByteBuffer.allocate(64*1024);
+            for(Long id: logs) { // NOTE(review): each file is replayed from its beginning; lastLogMark.txnLogPosition is not consulted here
+                FileChannel recLog = openChannel(id); // NOTE(review): this channel is not closed anywhere in this constructor
+                while(true) {
+                    lenBuff.clear();
+                    fullRead(recLog, lenBuff);
+                    if (lenBuff.remaining() != 0) {
+                        break; // could not read a full 4-byte length prefix: end of this file
+                    }
+                    lenBuff.flip();
+                    int len = lenBuff.getInt();
+                    if (len == 0) {
+                        break; // zero length ends the replay of this file (presumably the zero-filled preallocated region -- confirm)
+                    }
+                    recBuff.clear();
+                    if (recBuff.remaining() < len) {
+                        recBuff = ByteBuffer.allocate(len); // grow for records larger than the current buffer
+                    }
+                    recBuff.limit(len);
+                    if (fullRead(recLog, recBuff) != len) {
+                        // This seems scary, but it just means that this is where we
+                        // left off writing
+                        break;
+                    }
+                    recBuff.flip();
+                    long ledgerId = recBuff.getLong(); // the record starts with the ledger id
+                    // XXX we need to make sure we set the master keys appropriately!
+                    LedgerDescriptor handle = getHandle(ledgerId, false);
+                    try {
+                        recBuff.rewind(); // hand the whole record, ledger id included, to addEntry
+                        handle.addEntry(recBuff);
+                    } finally {
+                        putHandle(handle); // always release the reference taken by getHandle
+                    }
+                }
+            }
+        }
         setDaemon(true);
         LOG.debug("I'm starting a bookie with journal directory " + journalDirectory.getName());
         start();
+        syncThread.start();
     }
 
+    private static int fullRead(FileChannel fc, ByteBuffer bb) throws IOException { // Reads from fc until bb has no space left or a read returns <= 0; returns the number of bytes read.
+        int total = 0;
+        while(bb.remaining() > 0) {
+            int rc = fc.read(bb);
+            if (rc <= 0) {
+                return total; // no more data: report the partial count so the caller can detect a short read
+            }
+            total += rc;
+        }
+        return total;
+    }
     private void putHandle(LedgerDescriptor handle) {
         synchronized (ledgers) {
             handle.decRef();
@@ -99,6 +211,9 @@ public class Bookie extends Thread {
         synchronized (ledgers) {
             handle = ledgers.get(ledgerId);
             if (handle == null) {
+                if (readonly) {
+                    throw new NoLedgerException(ledgerId);
+                }
                 handle = createHandle(ledgerId, readonly);
                 ledgers.put(ledgerId, handle);
                 handle.setMasterKey(ByteBuffer.wrap(masterKey));
@@ -113,6 +228,9 @@ public class Bookie extends Thread {
         synchronized (ledgers) {
             handle = ledgers.get(ledgerId);
             if (handle == null) {
+                if (readonly) {
+                    throw new NoLedgerException(ledgerId);
+                }
                 handle = createHandle(ledgerId, readonly);
                 ledgers.put(ledgerId, handle);
             } 
@@ -123,85 +241,9 @@ public class Bookie extends Thread {
     
 
     private LedgerDescriptor createHandle(long ledgerId, boolean readOnly) throws IOException { // Builds a descriptor for ledgerId backed by the shared entryLogger and ledgerCache.
-        RandomAccessFile ledgerFile = null;
-        RandomAccessFile ledgerIndexFile = null;
-        String ledgerName = getLedgerName(ledgerId, false);
-        String ledgerIndexName = getLedgerName(ledgerId, true);
-        for (File d : ledgerDirectories) {
-            File lf = new File(d, ledgerName);
-            File lif = new File(d, ledgerIndexName);
-            if (lf.exists()) {
-                if (ledgerFile != null) {
-                    throw new IOException("Duplicate ledger file found for "
-                            + ledgerId);
-                }
-                ledgerFile = new RandomAccessFile(lf, "rw");
-            }
-            if (lif.exists()) {
-                if (ledgerIndexFile != null) {
-                    throw new IOException(
-                            "Duplicate ledger index file found for " + ledgerId);
-                }
-                ledgerIndexFile = new RandomAccessFile(lif, "rw");
-            }
-        }
-        if (ledgerFile == null && ledgerIndexFile == null) {
-            if (readOnly) {
-                throw new NoLedgerException(ledgerId);
-            }
-            File dirs[] = pickDirs(ledgerDirectories);
-            File lf = new File(dirs[0], ledgerName);
-            checkParents(lf);
-            ledgerFile = new RandomAccessFile(lf, "rw");
-            ledgerFile.write(ledgerHeader);
-            File lif = new File(dirs[1], ledgerIndexName);
-            checkParents(lif);
-            ledgerIndexFile = new RandomAccessFile(lif, "rw");
-        }
-        if (ledgerFile != null && ledgerIndexFile != null) {
-            return new LedgerDescriptor(ledgerId, ledgerFile.getChannel(),
-                    ledgerIndexFile.getChannel());
-        }
-        if (ledgerFile == null) {
-            throw new IOException("Found index but no data for " + ledgerId);
-        }
-        throw new IOException("Found data but no index for " + ledgerId);
+        return new LedgerDescriptor(ledgerId, entryLogger, ledgerCache); // readOnly is no longer used in this body
     }
     
-    static final private void checkParents(File f) throws IOException {
-        File parent = f.getParentFile();
-        if (parent.exists()) {
-            return;
-        }
-        if (parent.mkdirs() == false) {
-            throw new IOException("Counldn't mkdirs for " + parent);
-        }
-    }
-
-    static final private Random rand = new Random();
-
-    static final private File[] pickDirs(File dirs[]) {
-        File rc[] = new File[2];
-        rc[0] = dirs[rand.nextInt(dirs.length)];
-        rc[1] = dirs[rand.nextInt(dirs.length)];
-        return rc;
-    }
-
-    static final private String getLedgerName(long ledgerId, boolean isIndex) {
-        int parent = (int) (ledgerId & 0xff);
-        int grandParent = (int) ((ledgerId & 0xff00) >> 8);
-        StringBuilder sb = new StringBuilder();
-        sb.append(Integer.toHexString(grandParent));
-        sb.append('/');
-        sb.append(Integer.toHexString(parent));
-        sb.append('/');
-        sb.append(Long.toHexString(ledgerId));
-        if (isIndex) {
-            sb.append(".idx");
-        }
-        return sb.toString();
-    }
-
     static class QueueEntry {
         QueueEntry(ByteBuffer entry, long ledgerId, long entryId, 
                 WriteCallback cb, Object ctx) {
@@ -229,15 +271,76 @@ public class Bookie extends Thread {
     
     public final static ByteBuffer zeros = ByteBuffer.allocate(512);
     
+    class LastLogMark { // Journal mark (log id + position) that is persisted to a "lastMark" file in every ledger directory.
+        long txnLogId; // id of the journal file (files are named <hex id>.txn)
+        long txnLogPosition; // position within that journal file
+        LastLogMark lastMark; // snapshot taken by markLog()
+        LastLogMark(long logId, long logPosition) {
+            this.txnLogId = logId;
+            this.txnLogPosition = logPosition;
+        }
+        synchronized void setLastLogMark(long logId, long logPosition) { // Records a new mark; called from Bookie.run() after a journal flush.
+            txnLogId = logId;
+            txnLogPosition = logPosition;
+        }
+        synchronized void markLog() { // Copies the current id/position into lastMark.
+            lastMark = new LastLogMark(txnLogId, txnLogPosition);
+        }
+        synchronized void rollLog() { // Writes the 16-byte mark to "lastMark" in each ledger directory and forces it to disk.
+            byte buff[] = new byte[16];
+            ByteBuffer bb = ByteBuffer.wrap(buff); // bb writes through to buff
+            bb.putLong(txnLogId); // NOTE(review): this writes the live fields, not the lastMark snapshot taken by markLog() -- confirm which is intended
+            bb.putLong(txnLogPosition);
+            for(File dir: ledgerDirectories) {
+                File file = new File(dir, "lastMark");
+                try {
+                    FileOutputStream fos = new FileOutputStream(file);
+                    fos.write(buff);
+                    fos.getChannel().force(true);
+                    fos.close(); // NOTE(review): skipped if write() or force() throws
+                } catch (IOException e) {
+                    LOG.error("Problems writing to " + file, e); // logged only; the remaining directories are still attempted
+                }
+            }
+        }
+        synchronized void readLog() { // Reads "lastMark" from each ledger directory, keeping the largest id and the largest position seen.
+            byte buff[] = new byte[16];
+            ByteBuffer bb = ByteBuffer.wrap(buff);
+            for(File dir: ledgerDirectories) {
+                File file = new File(dir, "lastMark");
+                try {
+                    FileInputStream fis = new FileInputStream(file);
+                    fis.read(buff); // NOTE(review): return value ignored; a short read is not detected
+                    fis.close();
+                    bb.clear(); // reset position so both longs are read from the start of buff
+                    long i = bb.getLong();
+                    long p = bb.getLong();
+                    if (i > txnLogId) {
+                        txnLogId = i;
+                    }
+                    if (p > txnLogPosition) { // NOTE(review): the position maximum is taken independently of the id, so the pair may come from different files -- confirm
+                        txnLogPosition = p;
+                    }
+                } catch (IOException e) {
+                    LOG.error("Problems reading from " + file + " (this is okay if it is the first time starting this bookie");
+                }
+            }
+        }
+    }
+    
+    private LastLogMark lastLogMark = new LastLogMark(0, 0);
+    
+    @Override
     public void run() {
         LinkedList<QueueEntry> toFlush = new LinkedList<QueueEntry>();
         ByteBuffer lenBuff = ByteBuffer.allocate(4);
         try {
-            FileChannel logFile = new RandomAccessFile(new File(journalDirectory,
-                    Long.toHexString(System.currentTimeMillis()) + ".txn"),
-                    "rw").getChannel();
+            long logId = System.currentTimeMillis();
+            FileChannel logFile = openChannel(logId);
+            BufferedChannel bc = new BufferedChannel(logFile, 65536);
             zeros.clear();
             long nextPrealloc = preAllocSize;
+            long lastFlushPosition = 0;
             logFile.write(zeros, nextPrealloc);
             while (true) {
                 QueueEntry qe = null;
@@ -245,10 +348,13 @@ public class Bookie extends Thread {
                     qe = queue.take();
                 } else {
                     qe = queue.poll();
-                    if (qe == null || toFlush.size() > 100) {
-                        logFile.force(false);
+                    if (qe == null || bc.position() > lastFlushPosition + 512*1024) {
+                        //logFile.force(false);
+                        bc.flush(true);
+                        lastFlushPosition = bc.position();
+                        lastLogMark.setLastLogMark(logId, lastFlushPosition);
                         for (QueueEntry e : toFlush) {
-                            e.cb.writeComplete(0, e.ledgerId, e.entryId, e.ctx);
+                            e.cb.writeComplete(0, e.ledgerId, e.entryId, null, e.ctx);
                         }
                         toFlush.clear();
                     }
@@ -259,8 +365,13 @@ public class Bookie extends Thread {
                 lenBuff.clear();
                 lenBuff.putInt(qe.entry.remaining());
                 lenBuff.flip();
-                logFile.write(new ByteBuffer[] { lenBuff, qe.entry });
-                if (logFile.position() > nextPrealloc) {
+                //
+                // we should be doing the following, but then we run out of
+                // direct byte buffers
+                // logFile.write(new ByteBuffer[] { lenBuff, qe.entry });
+                bc.write(lenBuff);
+                bc.write(qe.entry);
+                if (bc.position() > nextPrealloc) {
                     nextPrealloc = (logFile.size() / preAllocSize + 1) * preAllocSize;
                     zeros.clear();
                     logFile.write(zeros, nextPrealloc);
@@ -272,9 +383,18 @@ public class Bookie extends Thread {
         }
     }
 
+    private FileChannel openChannel(long logId) throws FileNotFoundException { // Opens journalDirectory/<hex logId>.txn in "rw" mode and returns its channel.
+        FileChannel logFile = new RandomAccessFile(new File(journalDirectory,
+                Long.toHexString(logId) + ".txn"),
+                "rw").getChannel();
+        return logFile; // the RandomAccessFile itself is not kept by this method
+    }
+
     public void shutdown() throws InterruptedException {
         this.interrupt();
         this.join();
+        syncThread.running = false;
+        syncThread.join();
         for(LedgerDescriptor d: ledgers.values()) {
             d.close();
         }
@@ -282,7 +402,6 @@ public class Bookie extends Thread {
     
     public void addEntry(ByteBuffer entry, WriteCallback cb, Object ctx, byte[] masterKey)
             throws IOException, BookieException {
-        
         long ledgerId = entry.getLong();
         LedgerDescriptor handle = getHandle(ledgerId, false, masterKey);
         
@@ -318,7 +437,7 @@ public class Bookie extends Thread {
     static class CounterCallback implements WriteCallback {
         int count;
 
-        synchronized public void writeComplete(int rc, long l, long e, Object ctx) {
+        synchronized public void writeComplete(int rc, long l, long e, InetSocketAddress addr, Object ctx) {
             count--;
             if (count == 0) {
                 notifyAll();

+ 157 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/BufferedChannel.java

@@ -0,0 +1,157 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.bookkeeper.bookie;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+
+/**
+ * Provides a buffering layer in front of a FileChannel.
+ *
+ * Writes are accumulated in a lazily allocated direct buffer of
+ * <code>capacity</code> bytes and handed to the channel when that buffer
+ * fills or when flush() is called. Positional reads are served from the
+ * pending write buffer when the requested offset falls in it, otherwise
+ * from a separate read buffer that is refilled from the channel.
+ * write() and read() are synchronized on this object; position() is not.
+ */
+public class BufferedChannel 
+{
+    ByteBuffer writeBuffer; // bytes accepted by write() but not yet handed to bc; allocated on first write()
+    ByteBuffer readBuffer; // cache of channel contents used by read(); allocated on first read()
+    private FileChannel bc; // the underlying channel
+    long position; // logical write position: channel position at construction plus bytes accepted by write()
+    int capacity; // size in bytes of each of the two buffers
+    long readBufferStartPosition; // file offset of the first byte of readBuffer; Long.MIN_VALUE until the first fill
+    long writeBufferStartPosition; // file offset of the first byte of writeBuffer
+    BufferedChannel(FileChannel bc, int capacity) throws IOException {
+        this.bc = bc;
+        this.capacity = capacity;
+        position = bc.position();
+        writeBufferStartPosition = position;
+    }
+/*    public void close() throws IOException {
+        bc.close();
+    }
+*/
+//    public boolean isOpen() {
+//        return bc.isOpen();
+//    }
+
+    synchronized public int write(ByteBuffer src) throws IOException { // Buffers all remaining bytes of src, spilling to the channel each time the buffer fills; returns the number of bytes taken.
+        int copied = 0;
+        if (writeBuffer == null) {
+            writeBuffer = ByteBuffer.allocateDirect(capacity);
+        }
+        while(src.remaining() > 0) {
+            int truncated = 0;
+            if (writeBuffer.remaining() < src.remaining()) {
+                truncated = src.remaining() - writeBuffer.remaining(); // bytes that do not fit in this round
+                src.limit(src.limit()-truncated); // temporarily shrink src so put() copies only what fits
+            }
+            copied += src.remaining();
+            writeBuffer.put(src);
+            src.limit(src.limit()+truncated); // restore the caller's limit
+            if (writeBuffer.remaining() == 0) { // buffer full: push it to the channel
+                writeBuffer.flip();
+                bc.write(writeBuffer); // NOTE(review): return value ignored; assumes the channel takes the whole buffer in one call -- confirm
+                writeBuffer.clear();
+                writeBufferStartPosition = bc.position();
+            }
+        }
+        position += copied;
+        return copied;
+    }
+    
+    public long position() { // Returns the logical write position (buffered bytes included).
+        return position;
+    }
+    
+    public void flush(boolean sync) throws IOException { // Writes any buffered bytes to the channel and, if sync is true, forces the channel (without metadata).
+        synchronized(this) {
+            if (writeBuffer == null) {
+                return; // write() was never called on this object; note that force() is skipped as well
+            }
+            writeBuffer.flip();
+            bc.write(writeBuffer);
+            writeBuffer.clear();
+            writeBufferStartPosition = bc.position();
+        }
+        if (sync) {
+            bc.force(false); // done outside the monitor
+        }
+    }
+
+    /*public Channel getInternalChannel() {
+        return bc;
+    }*/
+    synchronized public int read(ByteBuffer buff, long pos) throws IOException { // Fills buff with bytes starting at file offset pos; returns the number of bytes that were requested (buff.remaining() on entry).
+        if (readBuffer == null) {
+            readBuffer = ByteBuffer.allocateDirect(capacity);
+            readBufferStartPosition = Long.MIN_VALUE; // marks the read buffer as holding nothing yet
+        }
+        int rc = buff.remaining();
+        while(buff.remaining() > 0) {
+            // case 1: the offset is at or after the start of the pending write buffer
+            if (writeBuffer != null && writeBufferStartPosition <= pos) {
+                long positionInBuffer = pos - writeBufferStartPosition;
+                long bytesToCopy = writeBuffer.position()-positionInBuffer; // buffered bytes available from pos onwards
+                if (bytesToCopy > buff.remaining()) {
+                    bytesToCopy = buff.remaining();
+                }
+                if (bytesToCopy == 0) {
+                    throw new IOException("Read past EOF"); // NOTE(review): only == 0 is checked; a negative value (pos beyond the buffered data) reaches src.limit() below -- confirm
+                }
+                ByteBuffer src = writeBuffer.duplicate(); // independent position/limit over the same content
+                src.position((int) positionInBuffer);
+                src.limit((int) (positionInBuffer+bytesToCopy));
+                buff.put(src);
+                pos+= bytesToCopy;
+                // case 2: the offset falls inside the window currently held by readBuffer
+            } else if (readBufferStartPosition <= pos && pos < readBufferStartPosition+readBuffer.capacity()) {
+                long positionInBuffer = pos - readBufferStartPosition;
+                long bytesToCopy = readBuffer.capacity()-positionInBuffer;
+                if (bytesToCopy > buff.remaining()) {
+                    bytesToCopy = buff.remaining();
+                }
+                ByteBuffer src = readBuffer.duplicate();
+                src.position((int) positionInBuffer);
+                src.limit((int) (positionInBuffer+bytesToCopy));
+                buff.put(src);
+                pos += bytesToCopy;
+            // case 3: refill readBuffer from the channel, then loop again to copy out of it
+            } else {
+                readBufferStartPosition = pos;
+                readBuffer.clear();
+                // make sure that we don't overlap with the write buffer
+                if (readBufferStartPosition + readBuffer.capacity() >= writeBufferStartPosition) {
+                    readBufferStartPosition = writeBufferStartPosition - readBuffer.capacity(); // slide the window back so it ends where the write buffer starts
+                    if (readBufferStartPosition < 0) {
+                        readBuffer.put(LedgerEntryPage.zeroPage, 0, (int)-readBufferStartPosition); // window starts before offset 0: pad the front from zeroPage
+                    }
+                }
+                while(readBuffer.remaining() > 0) {
+                    if (bc.read(readBuffer, readBufferStartPosition+readBuffer.position()) <= 0) {
+                        throw new IOException("Short read"); // the channel could not fill the whole window
+                    }
+                }
+                readBuffer.put(LedgerEntryPage.zeroPage, 0, readBuffer.remaining()); // NOTE(review): the loop above only exits with remaining() == 0, so this copies nothing
+                readBuffer.clear(); // resets position/limit only; the content stays for the duplicate() reads in case 2
+            }
+        }
+        return rc;
+    }
+}

+ 264 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/EntryLogger.java

@@ -0,0 +1,264 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.bookkeeper.bookie;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.log4j.Logger;
+
+/**
+ * This class manages the writing of the bookkeeper entries. All the new
+ * entries are written to a common log. The LedgerCache will have pointers
+ * into files created by this class with offsets into the files to find
+ * the actual ledger entry. The entry log files created by this class are
+ * identified by a long.
+ */
+public class EntryLogger {
+    private static final Logger LOG = Logger.getLogger(EntryLogger.class);
+    private File dirs[];
+    private long logId;
+    /**
+     * The maximum size of an entry logger file.
+     */
+    final static long LOG_SIZE_LIMIT = 2*1024*1024*1024L;
+    private volatile BufferedChannel logChannel;
+    // The ledgers contained in this file; seems to be unused right now
+    //private HashSet<Long> ledgerMembers = new HashSet<Long>();
+    /**
+     * The 1K block at the head of the entry logger file
+     * that contains the fingerprint and (future) meta-data
+     */
+    final static ByteBuffer LOGFILE_HEADER = ByteBuffer.allocate(1024);
+    static {
+        LOGFILE_HEADER.put("BKLO".getBytes());
+    }
+    // this indicates that a write has happened since the last flush
+    private volatile boolean somethingWritten = false;
+    
+    /**
+     * Builds an EntryLogger that keeps its log files in the given directories.
+     * The starting log id is one past the largest id recorded in any
+     * directory's "lastId" file, and a fresh log file is opened right away.
+     *
+     * @param dirs directories that may hold entry log files
+     * @throws IOException if the new log file cannot be created
+     */
+    public EntryLogger(File dirs[]) throws IOException {
+        this.dirs = dirs;
+        // Start one past the highest log id any directory has recorded.
+        for (int i = 0; i < dirs.length; i++) {
+            final long recorded = getLastLogId(dirs[i]);
+            if (logId <= recorded) {
+                logId = recorded + 1;
+            }
+        }
+        createLogId(logId);
+    }
+    
+    /**
+     * Maps entry log files to open channels.
+     */
+    private ConcurrentHashMap<Long, BufferedChannel> channels = new ConcurrentHashMap<Long, BufferedChannel>();
+    
+    /**
+     * Opens a brand-new entry log file for the given id and makes it the
+     * current log. The directory list is shuffled and the file is created in
+     * whichever directory comes first; the previous log, if there is one, is
+     * flushed before the new file is opened. The new id is then recorded in
+     * the "lastId" file of every directory.
+     */
+    private void createLogId(long logId) throws IOException {
+        List<File> candidates = Arrays.asList(dirs);
+        Collections.shuffle(candidates);
+        File hostDir = candidates.get(0);
+        if (logChannel != null) {
+            logChannel.flush(true);
+        }
+        File logFile = new File(hostDir, Long.toHexString(logId)+".log");
+        RandomAccessFile raf = new RandomAccessFile(logFile, "rw");
+        logChannel = new BufferedChannel(raf.getChannel(), 64*1024);
+        // every log file starts with the fixed-size header block
+        logChannel.write((ByteBuffer) LOGFILE_HEADER.clear());
+        channels.put(logId, logChannel);
+        for (int i = 0; i < dirs.length; i++) {
+            setLastLogId(dirs[i], logId);
+        }
+    }
+
+    /**
+     * Records the given log id, as a hexadecimal string followed by a
+     * newline, in the "lastId" file of the given directory.
+     */
+    private void setLastLogId(File dir, long logId) throws IOException {
+        File idFile = new File(dir, "lastId");
+        FileOutputStream out = new FileOutputStream(idFile);
+        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
+        try {
+            String line = Long.toHexString(logId) + "\n";
+            writer.write(line);
+            writer.flush();
+        } finally {
+            try {
+                out.close();
+            } catch (IOException e) {
+                // a failure to close is deliberately ignored
+            }
+        }
+    }
+    
+    /**
+     * Reads the log id stored in the "lastId" file of the given directory.
+     * The id is stored by setLastLogId as a hexadecimal string, so it is
+     * parsed here in base 16.
+     *
+     * @param f directory that may contain a "lastId" file
+     * @return the recorded id, or -1 if the file is missing, empty,
+     *         unreadable or does not hold a valid hexadecimal number
+     */
+    private long getLastLogId(File f) {
+        FileInputStream fis;
+        try {
+            fis = new FileInputStream(new File(f, "lastId"));
+        } catch (FileNotFoundException e) {
+            return -1;
+        }
+        BufferedReader br = new BufferedReader(new InputStreamReader(fis));
+        try {
+            String lastIdString = br.readLine();
+            if (lastIdString == null) {
+                // empty file: nothing recorded yet
+                return -1;
+            }
+            // must mirror Long.toHexString() used by setLastLogId
+            return Long.parseLong(lastIdString.trim(), 16);
+        } catch (IOException e) {
+            return -1;
+        } catch(NumberFormatException e) {
+            return -1;
+        } finally {
+            try {
+                fis.close();
+            } catch (IOException e) {
+                // a failure to close is deliberately ignored
+            }
+        }
+    }
+    
+    private void openNewChannel() throws IOException {
+        // roll over to the next log id and open its file
+        logId += 1;
+        createLogId(logId);
+    }
+    
+    synchronized void flush() throws IOException {
+        // nothing to flush until a log channel exists
+        if (logChannel == null) {
+            return;
+        }
+        logChannel.flush(true);
+    }
+    /**
+     * Appends an entry to the current log, preceded by its length as a 4-byte
+     * int. If the write would push the current log past LOG_SIZE_LIMIT, a new
+     * log file is opened first.
+     *
+     * @param ledger not used by this method
+     * @param entry the bytes to append (everything remaining in the buffer)
+     * @return the log id shifted into the upper 32 bits, OR-ed with the
+     *         position of the entry's first byte (just past the length prefix)
+     */
+    synchronized long addEntry(long ledger, ByteBuffer entry) throws IOException {
+        final int entryLength = entry.remaining();
+        if (logChannel.position() + entryLength + 4 > LOG_SIZE_LIMIT) {
+            openNewChannel();
+        }
+        ByteBuffer lengthPrefix = ByteBuffer.allocate(4);
+        lengthPrefix.putInt(entryLength);
+        lengthPrefix.flip();
+        logChannel.write(lengthPrefix);
+        // the returned location points just past the length prefix
+        final long entryPos = logChannel.position();
+        logChannel.write(entry);
+        somethingWritten = true;
+        return (logId << 32L) | entryPos;
+    }
+    
+    /**
+     * Reads back an entry written by addEntry and verifies that it carries
+     * the expected ledger id and entry id.
+     *
+     * @param ledgerId ledger id the stored entry must start with
+     * @param entryId entry id that must follow the ledger id
+     * @param location value returned by addEntry: log id in the upper 32 bits,
+     *        position of the entry in the lower 32 bits
+     * @return the entry bytes, beginning with the ledger id and entry id
+     * @throws IOException if the log file cannot be found, a read comes up
+     *         short, the stored length is invalid, or the ids do not match
+     */
+    byte[] readEntry(long ledgerId, long entryId, long location) throws IOException {
+        long entryLogId = location >> 32L;
+        long pos = location & 0xffffffffL;
+        ByteBuffer sizeBuff = ByteBuffer.allocate(4);
+        pos -= 4; // step back to the 4-byte length prefix written just before the entry
+        BufferedChannel fc;
+        try {
+            fc = getChannelForLogId(entryLogId);
+        } catch (FileNotFoundException e) {
+            // re-throw with ledger/location context while keeping the original stack trace
+            FileNotFoundException newe = new FileNotFoundException(e.getMessage() + " for " + ledgerId + " with location " + location);
+            newe.setStackTrace(e.getStackTrace());
+            throw newe;
+        }
+        if (fc.read(sizeBuff, pos) != sizeBuff.capacity()) {
+            throw new IOException("Short read from entrylog " + entryLogId);
+        }
+        pos += 4;
+        sizeBuff.flip();
+        int entrySize = sizeBuff.getInt();
+        // entrySize counts the entry bytes only (which begin with the ledger id
+        // and entry id); it does not include the 4-byte length prefix
+        if (entrySize > 1024*1024) {
+            LOG.error("Sanity check failed for entry size of " + entrySize + " at location " + pos + " in " + entryLogId);
+        }
+        // An entry must at least hold the two 8-byte ids checked below. Rejecting
+        // shorter (or negative) sizes here reports corruption as an IOException
+        // instead of NegativeArraySizeException / BufferUnderflowException.
+        if (entrySize < 16) {
+            throw new IOException("Invalid entry size " + entrySize + " at location " + pos + " in " + entryLogId);
+        }
+        byte data[] = new byte[entrySize];
+        ByteBuffer buff = ByteBuffer.wrap(data);
+        int rc = fc.read(buff, pos);
+        if ( rc != data.length) {
+            throw new IOException("Short read for " + ledgerId + "@" + entryId + " in " + entryLogId + "@" + pos + "("+rc+"!="+data.length+")");
+        }
+        buff.flip();
+        long thisLedgerId = buff.getLong();
+        if (thisLedgerId != ledgerId) {
+            throw new IOException("problem found in " + entryLogId + "@" + entryId + " at position + " + pos + " entry belongs to " + thisLedgerId + " not " + ledgerId);
+        }
+        long thisEntryId = buff.getLong();
+        if (thisEntryId != entryId) {
+            throw new IOException("problem found in " + entryLogId + "@" + entryId + " at position + " + pos + " entry is " + thisEntryId + " not " + entryId);
+        }
+        
+        return data;
+    }
+    
+    /**
+     * Returns the channel for the given entry log, opening the log file and
+     * caching the channel on first use.
+     */
+    private BufferedChannel getChannelForLogId(long entryLogId) throws IOException {
+        BufferedChannel cached = channels.get(entryLogId);
+        if (cached != null) {
+            return cached;
+        }
+        // open the file before taking the lock, then re-check under the lock
+        FileChannel opened = new RandomAccessFile(findFile(entryLogId), "rw").getChannel();
+        synchronized (channels) {
+            BufferedChannel current = channels.get(entryLogId);
+            if (current == null) {
+                current = new BufferedChannel(opened, 8192);
+                channels.put(entryLogId, current);
+            } else {
+                // another caller registered a channel first; discard ours
+                opened.close();
+            }
+            return current;
+        }
+    }
+
+    /**
+     * Looks through the directories for the log file with the given id.
+     *
+     * @throws FileNotFoundException if no directory contains the file
+     */
+    private File findFile(long logId) throws FileNotFoundException {
+        final String hexId = Long.toHexString(logId);
+        for (int i = 0; i < dirs.length; i++) {
+            File candidate = new File(dirs[i], hexId+".log");
+            if (candidate.exists()) {
+                return candidate;
+            }
+        }
+        throw new FileNotFoundException("No file for log " + hexId);
+    }
+    
+    // Currently a no-op: nothing is released here, and the channels held in
+    // the channels map are not closed by this method.
+    public void close() {
+    }
+
+    /**
+     * Reports whether the somethingWritten flag was set and resets it.
+     *
+     * @return the value of the flag before it was cleared
+     */
+    synchronized public boolean testAndClearSomethingWritten() {
+        final boolean wasWritten = somethingWritten;
+        somethingWritten = false;
+        return wasWritten;
+    }
+
+}

+ 113 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/FileInfo.java

@@ -0,0 +1,113 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.bookkeeper.bookie;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+
+/**
+ * This is the file handle for a ledger's index file that maps entry ids to location.
+ * It is used by LedgerCache.
+ *
+ * Positions passed to read and write are relative to START_OF_DATA; the
+ * area before that offset begins with the header fingerprint.
+ */
+class FileInfo {
+    private FileChannel fc;
+    /**
+     * The fingerprint of a ledger index file
+     */
+    private byte header[] = "BKLE\0\0\0\0".getBytes();
+    static final long START_OF_DATA = 1024;
+    // cached file size in bytes, including the header area
+    private long size;
+    // number of outstanding use() calls not yet matched by release()
+    private int useCount;
+    // set once close() has been requested
+    private boolean isClosed;
+
+    /**
+     * Opens the given index file in "rws" mode and, if the file is empty,
+     * writes the header fingerprint at its start.
+     *
+     * @param lf the index file
+     * @throws IOException if the file cannot be opened, sized or written
+     */
+    public FileInfo(File lf) throws IOException {
+        fc = new RandomAccessFile(lf, "rws").getChannel();
+        size = fc.size();
+        if (size == 0) {
+            fc.write(ByteBuffer.wrap(header));
+        }
+    }
+
+    /**
+     * @return the number of data bytes past START_OF_DATA according to the
+     *         cached size; never negative
+     */
+    synchronized public long size() {
+        long rc = size-START_OF_DATA;
+        if (rc < 0) {
+            rc = 0;
+        }
+        return rc;
+    }
+
+    /**
+     * Fills the buffer completely from the data area.
+     *
+     * @param bb buffer to fill; reading continues until it has no space left
+     * @param position offset relative to START_OF_DATA
+     * @return the number of bytes read
+     * @throws IOException if the file ends (or a read returns nothing)
+     *         before the buffer is full
+     */
+    synchronized public int read(ByteBuffer bb, long position) throws IOException {
+        int total = 0;
+        while(bb.remaining() > 0) {
+            // a positional read does not move the channel position, so a
+            // continuation after a partial read must skip what was already read
+            int rc = fc.read(bb, position+START_OF_DATA+total);
+            if (rc <= 0) {
+                throw new IOException("Short read");
+            }
+            total += rc;
+        }
+        return total;
+    }
+
+    /**
+     * Marks this handle closed. The channel is closed right away if nobody is
+     * using it; otherwise the last release() closes it.
+     */
+    synchronized public void close() throws IOException {
+        isClosed = true;
+        if (useCount == 0) {
+            fc.close();
+        }
+    }
+
+    /**
+     * Writes the buffers back to back into the data area, looping until the
+     * last buffer is drained, and grows the cached size if the write extended
+     * the file.
+     *
+     * @param buffs buffers to write, in order
+     * @param position offset relative to START_OF_DATA
+     * @return the number of bytes written
+     * @throws IOException if a write makes no progress
+     */
+    synchronized public long write(ByteBuffer[] buffs, long position) throws IOException {
+        long total = 0;
+        try {
+            fc.position(position+START_OF_DATA);
+            while(buffs[buffs.length-1].remaining() > 0) {
+                long rc = fc.write(buffs);
+                if (rc <= 0) {
+                    throw new IOException("Short write");
+                }
+                total += rc;
+            }
+        } finally {
+            // account for whatever was written, even if the write failed part way
+            long newsize = position+START_OF_DATA+total;
+            if (newsize > size) {
+                size = newsize;
+            }
+        }
+        return total;
+    }
+
+    /**
+     * Registers a user of this handle; must be paired with release().
+     */
+    synchronized public void use() {
+        useCount++;
+    }
+    
+    /**
+     * Drops one user. If close() was requested and this was the last user,
+     * the channel is closed now.
+     */
+    synchronized public void release() {
+        useCount--;
+        if (isClosed && useCount == 0) {
+            try {
+                fc.close();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+    }
+}

+ 454 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/LedgerCache.java

@@ -0,0 +1,454 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.bookkeeper.bookie;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.log4j.Logger;
+
+/**
+ * This class maps a ledger entry number into a location (entrylogid, offset) in
+ * an entry log file. It does user level caching to more efficiently manage disk
+ * head scheduling.
+ */
+public class LedgerCache {
+    // log under this class's own name (was registered under LedgerDescriptor)
+    private final static Logger LOG = Logger.getLogger(LedgerCache.class);
+    
+    final File ledgerDirectories[];
+
+    /**
+     * Creates a cache over the given directories. The array is stored as
+     * passed (no copy); it is searched for existing ledger index files and
+     * used to place new ones.
+     */
+    public LedgerCache(File ledgerDirectories[]) {
+        this.ledgerDirectories = ledgerDirectories;
+    }
+    /**
+     * the list of potentially clean ledgers
+     */
+    LinkedList<Long> cleanLedgers = new LinkedList<Long>();
+    
+    /**
+     * the list of potentially dirty ledgers
+     */
+    LinkedList<Long> dirtyLedgers = new LinkedList<Long>();
+    
+    HashMap<Long, FileInfo> fileInfoCache = new HashMap<Long, FileInfo>();
+    
+    LinkedList<Long> openLedgers = new LinkedList<Long>();
+    
+    static int OPEN_FILE_LIMIT = 900;
+    static {
+        if (System.getProperty("openFileLimit") != null) {
+            OPEN_FILE_LIMIT = Integer.parseInt(System.getProperty("openFileLimit"));
+        }
+        LOG.info("openFileLimit is " + OPEN_FILE_LIMIT);
+    }
+    
+    // Allocate a third of the maximum heap to the page cache. The division is
+    // done in long arithmetic and clamped before narrowing to int, so that
+    // large heaps do not overflow the cast.
+    private static int pageLimit = (int) Math.min(Integer.MAX_VALUE, Runtime.getRuntime().maxMemory() / 3 / LedgerEntryPage.PAGE_SIZE);
+    static {
+        LOG.info("maxMemory = " + Runtime.getRuntime().maxMemory());
+        // the "pageLimit" system property overrides the computed default
+        if (System.getProperty("pageLimit") != null) {
+            pageLimit = Integer.parseInt(System.getProperty("pageLimit"));
+        }
+        LOG.info("pageLimit is " + pageLimit);
+    }
+    // The number of pages that have actually been used
+    private int pageCount;
+    HashMap<Long, HashMap<Long,LedgerEntryPage>> pages = new HashMap<Long, HashMap<Long,LedgerEntryPage>>();
+    
+    // Files the page in the two-level table under its ledger and first entry,
+    // creating the per-ledger map on first use.
+    private void putIntoTable(HashMap<Long, HashMap<Long,LedgerEntryPage>> table, LedgerEntryPage lep) {
+        HashMap<Long, LedgerEntryPage> ledgerPages = table.get(lep.getLedger());
+        if (ledgerPages == null) {
+            ledgerPages = new HashMap<Long, LedgerEntryPage>();
+            table.put(lep.getLedger(), ledgerPages);
+        }
+        ledgerPages.put(lep.getFirstEntry(), lep);
+    }
+    
+    // Looks up the page for (ledger, firstEntry); null if the ledger has no
+    // map in the table or the map has no such page.
+    private static LedgerEntryPage getFromTable(HashMap<Long, HashMap<Long,LedgerEntryPage>> table, Long ledger, Long firstEntry) {
+        HashMap<Long, LedgerEntryPage> ledgerPages = table.get(ledger);
+        return ledgerPages == null ? null : ledgerPages.get(firstEntry);
+    }
+    
+   /**
+    * Looks up a cached page and marks it as in use for the caller.
+    *
+    * @param ledger ledger the page belongs to
+    * @param firstEntry id of the first entry covered by the page
+    * @param onlyDirty if true, clean pages are treated as absent
+    * @return the page with usePage() already called on it (the caller must
+    *         call releasePage()), or null if the page is not cached or is
+    *         filtered out by onlyDirty
+    */
+   synchronized private LedgerEntryPage getLedgerEntryPage(Long ledger, Long firstEntry, boolean onlyDirty) {
+        LedgerEntryPage lep = getFromTable(pages, ledger, firstEntry);
+        if (lep == null) {
+            // not cached (it may have been recycled since the caller last looked)
+            return null;
+        }
+        if (onlyDirty && lep.isClean()) {
+            // the page is not handed out, so it must not be marked as used
+            return null;
+        }
+        lep.usePage();
+        return lep;
+    }
+
+   /**
+    * Stores the location of an entry in the index page that covers it,
+    * bringing that page into the cache first if it is not there yet.
+    *
+    * @param ledger ledger the entry belongs to
+    * @param entry entry id
+    * @param offset location value to record for the entry
+    */
+   public void putEntryOffset(long ledger, long entry, long offset) throws IOException {
+        final int slot = (int) (entry%LedgerEntryPage.ENTRIES_PER_PAGES);
+        // id of the first entry of the page that holds this entry
+        final long firstEntryOfPage = entry-slot;
+        LedgerEntryPage page = getLedgerEntryPage(ledger, firstEntryOfPage, false);
+        if (page == null) {
+            // not cached: take a free page and load it before publishing it in the table
+            page = grabCleanPage(ledger, firstEntryOfPage);
+            updatePage(page);
+            synchronized(this) {
+                putIntoTable(pages, page);
+            }
+        }
+        // each slot is 8 bytes wide
+        page.setOffset(offset, slot*8);
+        page.releasePage();
+    }
+    
+    /**
+     * Returns the location recorded for an entry, bringing the index page
+     * that covers it into the cache first if necessary.
+     *
+     * @param ledger ledger the entry belongs to
+     * @param entry entry id
+     * @return the value stored in the entry's 8-byte slot of its index page
+     */
+    public long getEntryOffset(long ledger, long entry) throws IOException {
+        int offsetInPage = (int) (entry%LedgerEntryPage.ENTRIES_PER_PAGES);
+        // find the id of the first entry of the page that has the entry
+        // we are looking for
+        long pageEntry = entry-offsetInPage;
+        LedgerEntryPage lep = getLedgerEntryPage(ledger, pageEntry, false);
+        try {
+            if (lep == null) {
+                lep = grabCleanPage(ledger, pageEntry);
+                // Load the page before publishing it in the table (the same order
+                // putEntryOffset uses); otherwise another thread could find the
+                // page in the table and use it before its contents are loaded.
+                updatePage(lep);
+                synchronized(this) {
+                    putIntoTable(pages, lep);
+                }
+            }
+            return lep.getOffset(offsetInPage*8);
+        } finally {
+            if (lep != null) {
+                lep.releasePage();
+            }
+        }
+    }
+    
+    // Builds the relative path of a ledger's index file:
+    // <hex of bits 8-15>/<hex of bits 0-7>/<hex ledger id>.idx
+    static final private String getLedgerName(long ledgerId) {
+        final int lowByte = (int) (ledgerId & 0xff);
+        final int secondByte = (int) ((ledgerId & 0xff00) >> 8);
+        return Integer.toHexString(secondByte) + '/'
+                + Integer.toHexString(lowByte) + '/'
+                + Long.toHexString(ledgerId) + ".idx";
+    }
+    
+    /**
+     * Makes sure the parent directory of the given file exists, creating it
+     * (and any missing ancestors) if necessary.
+     *
+     * @throws IOException if the directory does not exist and could not be created
+     */
+    static final private void checkParents(File f) throws IOException {
+        File parent = f.getParentFile();
+        if (parent.exists()) {
+            return;
+        }
+        // mkdirs() also returns false when someone else created the directory
+        // in the meantime, so only fail if it is still not there
+        if (!parent.mkdirs() && !parent.isDirectory()) {
+            throw new IOException("Couldn't mkdirs for " + parent);
+        }
+    }
+    
+    static final private Random rand = new Random();
+
+    // Picks one of the given directories at random.
+    static final private File pickDirs(File dirs[]) {
+        final int choice = rand.nextInt(dirs.length);
+        return dirs[choice];
+    }
+
+    /**
+     * Returns the handle for a ledger's index file with use() already called
+     * on it; the caller is expected to call release() when done.
+     *
+     * @param ledger ledger id
+     * @param create whether to set up a new index file when none of the
+     *        ledger directories contains one
+     * @throws IOException if the file is missing and create is false
+     *         (Bookie.NoLedgerException), or the file cannot be opened
+     */
+    FileInfo getFileInfo(Long ledger, boolean create) throws IOException {
+        synchronized(fileInfoCache) {
+            FileInfo cached = fileInfoCache.get(ledger);
+            if (cached != null) {
+                cached.use();
+                return cached;
+            }
+            // look for an existing index file in any of the ledger directories
+            final String relativeName = getLedgerName(ledger);
+            File indexFile = null;
+            for (int i = 0; i < ledgerDirectories.length && indexFile == null; i++) {
+                File candidate = new File(ledgerDirectories[i], relativeName);
+                if (candidate.exists()) {
+                    indexFile = candidate;
+                }
+            }
+            if (indexFile == null) {
+                if (!create) {
+                    throw new Bookie.NoLedgerException(ledger);
+                }
+                indexFile = new File(pickDirs(ledgerDirectories), relativeName);
+                checkParents(indexFile);
+            }
+            // over the limit: close the handle that was opened longest ago
+            if (openLedgers.size() > OPEN_FILE_LIMIT) {
+                fileInfoCache.remove(openLedgers.removeFirst()).close();
+            }
+            FileInfo opened = new FileInfo(indexFile);
+            fileInfoCache.put(ledger, opened);
+            openLedgers.add(ledger);
+            opened.use();
+            return opened;
+        }
+    }
+    /**
+     * Fills a clean page with its contents: zeroes if the page starts at or
+     * beyond the end of the index file's data, otherwise the bytes read from
+     * the file.
+     *
+     * @throws IOException if the page is dirty or the file cannot be read
+     */
+    private void updatePage(LedgerEntryPage lep) throws IOException {
+        if (!lep.isClean()) {
+            throw new IOException("Trying to update a dirty page");
+        }
+        final FileInfo fi = getFileInfo(lep.getLedger(), true);
+        try {
+            // each entry takes 8 bytes in the index file
+            final boolean beyondEnd = lep.getFirstEntry()*8 >= fi.size();
+            if (beyondEnd) {
+                lep.zeroPage();
+            } else {
+                lep.readPage(fi);
+            }
+        } finally {
+            fi.release();
+        }
+    }
+
+    /**
+     * Writes dirty index pages back to their ledger index files, one ledger
+     * at a time. When the dirty list is empty it is first refilled with every
+     * ledger that currently has cached pages.
+     *
+     * @param doAll if true, keep going until the dirty list is empty; if
+     *        false, stop after the first ledger that has cached pages has
+     *        been processed
+     * @throws IOException if an index file cannot be opened or written
+     */
+    void flushLedger(boolean doAll) throws IOException {
+        synchronized(dirtyLedgers) {
+            if (dirtyLedgers.isEmpty()) {
+                synchronized(this) {
+                    for(Long l: pages.keySet()) {
+                        if (LOG.isTraceEnabled()) {
+                            LOG.trace("Adding " + Long.toHexString(l) + " to dirty pages");
+                        }
+                        dirtyLedgers.add(l);
+                    }
+                }
+            }
+            if (dirtyLedgers.isEmpty()) {
+                return;
+            }
+            while(!dirtyLedgers.isEmpty()) {
+                Long l = dirtyLedgers.removeFirst();
+                LinkedList<Long> firstEntryList;
+                // collect the first-entry ids of this ledger's dirty pages
+                synchronized(this) {
+                    HashMap<Long, LedgerEntryPage> pageMap = pages.get(l);
+                    if (pageMap == null || pageMap.isEmpty()) {
+                        continue;
+                    }
+                    firstEntryList = new LinkedList<Long>();
+                    for(Map.Entry<Long, LedgerEntryPage> entry: pageMap.entrySet()) {
+                        LedgerEntryPage lep = entry.getValue();
+                        if (lep.isClean()) {
+                            if (LOG.isTraceEnabled()) {
+                                LOG.trace("Page is clean " + lep);
+                            }
+                            continue;
+                        }
+                        firstEntryList.add(lep.getFirstEntry());
+                    }
+                }
+                // Now flush all the pages of a ledger
+                List<LedgerEntryPage> entries = new ArrayList<LedgerEntryPage>(firstEntryList.size());
+                FileInfo fi = null;
+                try {
+                    for(Long firstEntry: firstEntryList) {
+                        LedgerEntryPage lep = getLedgerEntryPage(l, firstEntry, true);
+                        if (lep != null) {
+                            entries.add(lep);
+                        }
+                    }
+                    // order the pages by first entry so that adjacent pages can be written together
+                    Collections.sort(entries, new Comparator<LedgerEntryPage>() {
+                        @Override
+                        public int compare(LedgerEntryPage o1, LedgerEntryPage o2) {
+                            // compare explicitly: casting the long difference to int can overflow
+                            long first1 = o1.getFirstEntry();
+                            long first2 = o2.getFirstEntry();
+                            return first1 < first2 ? -1 : (first1 > first2 ? 1 : 0);
+                        }});
+                    ArrayList<Integer> versions = new ArrayList<Integer>(entries.size());
+                    fi = getFileInfo(l, true);
+                    int start = 0;
+                    long lastOffset = -1;
+                    for(int i = 0; i < entries.size(); i++) {
+                        // remember the version being written; it is passed to setClean() below
+                        versions.add(i, entries.get(i).getVersion());
+                        if (lastOffset != -1 && (entries.get(i).getFirstEntry() - lastOffset) != LedgerEntryPage.ENTRIES_PER_PAGES) {
+                            // send up a sequential list
+                            int count = i - start;
+                            if (count == 0) {
+                                LOG.warn("Count cannot possibly be zero!");
+                            }
+                            writeBuffers(l, entries, fi, start, count);
+                            start = i;
+                        }
+                        lastOffset = entries.get(i).getFirstEntry();
+                    }
+                    if (entries.size()-start == 0 && entries.size() != 0) {
+                        LOG.warn("Nothing to write, but there were entries!");
+                    }
+                    // write the final run of adjacent pages
+                    writeBuffers(l, entries, fi, start, entries.size()-start);
+                    synchronized(this) {
+                        for(int i = 0; i < entries.size(); i++) {
+                            LedgerEntryPage lep = entries.get(i);
+                            lep.setClean(versions.get(i));
+                        }
+                    }
+                } finally {
+                    for(LedgerEntryPage lep: entries) {
+                        lep.releasePage();
+                    }
+                    if (fi != null) {
+                        fi.release();
+                    }
+                }
+                if (!doAll) {
+                    break;
+                }
+                // Yield. If we are doing all the ledgers we don't want to block other flushes that
+                // need to happen
+                try {
+                    dirtyLedgers.wait(1);
+                } catch (InterruptedException e) {
+                    // just pass it on
+                    Thread.currentThread().interrupt();
+                }
+            }
+        }
+    }
+    
+    /**
+     * Writes a run of pages to the ledger's index file in one gathered write,
+     * starting at the file position of the first page in the run.
+     *
+     * @param ledger ledger every page in the run must belong to
+     * @param entries pages to pick the run from
+     * @param fi handle of the ledger's index file
+     * @param start index in entries of the first page of the run
+     * @param count number of pages in the run; nothing is written when zero
+     * @throws IOException if a page belongs to another ledger or fewer bytes
+     *         than count pages were written
+     */
+    private void writeBuffers(Long ledger,
+            List<LedgerEntryPage> entries, FileInfo fi,
+            int start, int count) throws IOException {
+        if (LOG.isTraceEnabled()) {
+            LOG.trace("Writing " + count + " buffers of " + Long.toHexString(ledger));
+        }
+        if (count == 0) {
+            return;
+        }
+        final ByteBuffer[] pageBuffers = new ByteBuffer[count];
+        for (int k = 0; k < count; k++) {
+            final int idx = start+k;
+            pageBuffers[k] = entries.get(idx).getPageToWrite();
+            if (entries.get(idx).getLedger() != ledger) {
+                throw new IOException("Writing to " + ledger + " but page belongs to " + entries.get(idx).getLedger());
+            }
+        }
+        final ByteBuffer lastBuffer = pageBuffers[pageBuffers.length-1];
+        long bytesWritten = 0;
+        // keep writing until the last buffer has been drained
+        while (lastBuffer.remaining() > 0) {
+            long rc = fi.write(pageBuffers, entries.get(start+0).getFirstEntry()*8);
+            if (rc <= 0) {
+                throw new IOException("Short write to ledger " + ledger + " rc = " + rc);
+            }
+            bytesWritten += rc;
+        }
+        if (bytesWritten != count*LedgerEntryPage.PAGE_SIZE) {
+            throw new IOException("Short write to ledger " + ledger + " wrote " + bytesWritten + " expected " + count*LedgerEntryPage.PAGE_SIZE);
+        }
+    }
+    /**
+     * Obtains a page to hold the index entries starting at the given entry.
+     * A new page is allocated while fewer than pageLimit pages exist;
+     * after that a cached page that is clean and not in use is taken out of
+     * the table, zeroed and re-labelled.
+     *
+     * @param ledger ledger the page will belong to
+     * @param entry first entry of the page; must be a multiple of
+     *        LedgerEntryPage.ENTRIES_PER_PAGES
+     * @return a page on which usePage() has been called, labelled with the
+     *         given ledger and entry; it is not put into the page table here
+     * @throws IllegalArgumentException if entry is not page aligned
+     */
+    private LedgerEntryPage grabCleanPage(long ledger, long entry) throws IOException {
+        if (entry % LedgerEntryPage.ENTRIES_PER_PAGES != 0) {
+            throw new IllegalArgumentException(entry + " is not a multiple of " + LedgerEntryPage.ENTRIES_PER_PAGES);
+        }
+        synchronized(this) {
+            if (pageCount  < pageLimit) {
+                // let's see if we can allocate something
+                LedgerEntryPage lep = new LedgerEntryPage();
+                lep.setLedger(ledger);
+                lep.setFirstEntry(entry);
+                // note, this will not block since it is a new page
+                lep.usePage();
+                pageCount++;
+                return lep;
+            }
+        }
+        
+        outerLoop:
+        while(true) {
+            synchronized(cleanLedgers) {
+                if (cleanLedgers.isEmpty()) {
+                    // out of candidates: flush a ledger so that some pages become
+                    // clean, then consider every ledger with cached pages again
+                    flushLedger(false);
+                    synchronized(this) {
+                        for(Long l: pages.keySet()) {
+                            cleanLedgers.add(l);
+                        }
+                    }
+                }
+                synchronized(this) {
+                    Long cleanLedger = cleanLedgers.getFirst();
+                    Map<Long, LedgerEntryPage> map = pages.get(cleanLedger);
+                    if (map == null || map.isEmpty()) {
+                        cleanLedgers.removeFirst();
+                        continue;
+                    }
+                    Iterator<Map.Entry<Long, LedgerEntryPage>> it = map.entrySet().iterator();
+                    LedgerEntryPage lep = it.next().getValue();
+                    while((lep.inUse() || !lep.isClean())) {
+                        if (!it.hasNext()) {
+                            // No reusable page in this ledger. Drop it from the
+                            // candidates so the next pass looks at another ledger
+                            // (or flushes once the candidate list runs dry)
+                            // instead of retrying this one forever.
+                            cleanLedgers.removeFirst();
+                            continue outerLoop;
+                        }
+                        lep = it.next().getValue();
+                    }
+                    // take the page out of the table before re-labelling it
+                    it.remove();
+                    if (map.isEmpty()) {
+                        pages.remove(lep.getLedger());
+                    }
+                    lep.usePage();
+                    lep.zeroPage();
+                    lep.setLedger(ledger);
+                    lep.setFirstEntry(entry);
+                    return lep;
+                }
+            }
+        }
+    }
+
+    /**
+     * Finds the highest last-entry value reported by the cached pages of a
+     * ledger. Only pages currently in the cache are consulted.
+     *
+     * @param ledgerId ledger to inspect
+     * @return the largest value returned by getLastEntry() over the ledger's
+     *         cached pages, or 0 if there are none or none exceeds 0
+     */
+    public long getLastEntry(long ledgerId) {
+        long best = 0;
+        synchronized(this) {
+            Map<Long, LedgerEntryPage> ledgerPages = pages.get(ledgerId);
+            if (ledgerPages != null) {
+                Iterator<LedgerEntryPage> it = ledgerPages.values().iterator();
+                while (it.hasNext()) {
+                    LedgerEntryPage page = it.next();
+                    // only look inside pages whose range can reach the best value so far
+                    if (page.getFirstEntry() + LedgerEntryPage.ENTRIES_PER_PAGES >= best) {
+                        page.usePage();
+                        long highest = page.getLastEntry();
+                        if (best < highest) {
+                            best = highest;
+                        }
+                        page.releasePage();
+                    }
+                }
+            }
+        }
+        
+        return best;
+    }
+}

+ 44 - 76
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/LedgerDescriptor.java

@@ -1,4 +1,3 @@
-package org.apache.bookkeeper.bookie;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,11 +19,10 @@ package org.apache.bookkeeper.bookie;
  * 
  */
 
+package org.apache.bookkeeper.bookie;
 
 import java.io.IOException;
-import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
 
 import org.apache.log4j.Logger;
 
@@ -36,11 +34,12 @@ import org.apache.log4j.Logger;
  *
  */
 public class LedgerDescriptor {
-    Logger LOG = Logger.getLogger(LedgerDescriptor.class);
-    LedgerDescriptor(long ledgerId, FileChannel ledger, FileChannel ledgerIndex) {
+    final static Logger LOG = Logger.getLogger(LedgerDescriptor.class);
+    LedgerCache ledgerCache;
+    LedgerDescriptor(long ledgerId, EntryLogger entryLogger, LedgerCache ledgerCache) {
         this.ledgerId = ledgerId;
-        this.ledger = ledger;
-        this.ledgerIndex = ledgerIndex;
+        this.entryLogger = entryLogger;
+        this.ledgerCache = ledgerCache;
     }
     
     private ByteBuffer masterKey = null;
@@ -54,8 +53,7 @@ public class LedgerDescriptor {
     }
     
     private long ledgerId;
-    private FileChannel ledger;
-    private FileChannel ledgerIndex;
+    EntryLogger entryLogger;
     private int refCnt;
     synchronized public void incRef() {
         refCnt++;
@@ -66,100 +64,70 @@ public class LedgerDescriptor {
     synchronized public int getRefCnt() {
         return refCnt;
     }
-    static private final long calcEntryOffset(long entryId) {
-        return 8L*entryId;
-    }
     long addEntry(ByteBuffer entry) throws IOException {
-        ByteBuffer offsetBuffer = ByteBuffer.wrap(new byte[8]);
         long ledgerId = entry.getLong();
         if (ledgerId != this.ledgerId) {
             throw new IOException("Entry for ledger " + ledgerId + " was sent to " + this.ledgerId);
         }
-        /*
-         * Get entry id
-         */
-                
         long entryId = entry.getLong();
         entry.rewind();
         
         /*
-         * Set offset of entry id to be the current ledger position
+         * Log the entry
          */
-        offsetBuffer.rewind();
-        offsetBuffer.putLong(ledger.position());
-        //LOG.debug("Offset: " + ledger.position() + ", " + entry.position() + ", " + calcEntryOffset(entryId) + ", " + entryId);
-        offsetBuffer.flip();
-        
-        /*
-         * Write on the index entry corresponding to entryId the position
-         * of this entry.
-         */
-        ledgerIndex.write(offsetBuffer, calcEntryOffset(entryId));
-        ByteBuffer lenBuffer = ByteBuffer.allocate(4);
+        long pos = entryLogger.addEntry(ledgerId, entry);
         
         
-        lenBuffer.putInt(entry.remaining());
-        lenBuffer.flip();
-        
         /*
-         * Write length of entry first, then the entry itself
+         * Set offset of entry id to be the current ledger position
          */
-        ledger.write(lenBuffer);
-        ledger.write(entry);
-        //entry.position(24);
-        //LOG.debug("Entry: " + entry.position() + ", " + new String(entry.array()));
-     
+        ledgerCache.putEntryOffset(ledgerId, entryId, pos);
         return entryId;
     }
     ByteBuffer readEntry(long entryId) throws IOException {
-        ByteBuffer buffer = ByteBuffer.wrap(new byte[8]);
         long offset;
         /*
          * If entryId is -1, then return the last written.
          */
         if (entryId == -1) {
-            offset = ledgerIndex.size()-8; 
-        } else {
-            offset = calcEntryOffset(entryId);
+            long lastEntry = ledgerCache.getLastEntry(ledgerId);
+            FileInfo fi = null;
+            try {
+                fi = ledgerCache.getFileInfo(ledgerId, false);
+                long size = fi.size();
+                // we may not have the last entry in the cache
+                if (size > lastEntry*8) {
+                    ByteBuffer bb = ByteBuffer.allocate(LedgerEntryPage.PAGE_SIZE);
+                    long position = size-LedgerEntryPage.PAGE_SIZE;
+                    if (position < 0) {
+                        position = 0;
+                    }
+                    fi.read(bb, position);
+                    bb.flip();
+                    long startingEntryId = position/8;
+                    for(int i = LedgerEntryPage.ENTRIES_PER_PAGES-1; i >= 0; i--) {
+                        if (bb.getLong(i*8) != 0) {
+                            if (lastEntry < startingEntryId+i) {
+                                lastEntry = startingEntryId+i;
+                            }
+                            break;
+                        }
+                    }
+                }
+            } finally {
+                if (fi != null) {
+                    fi.release();
+                }
+            }
+            entryId = lastEntry;
         }
-        int len = ledgerIndex.read(buffer, offset);
-        buffer.flip();
-        if (len != buffer.limit()) {
-            throw new Bookie.NoEntryException(ledgerId, entryId);
-        }
-        offset = buffer.getLong();
+        
+        offset = ledgerCache.getEntryOffset(ledgerId, entryId);
         if (offset == 0) {
             throw new Bookie.NoEntryException(ledgerId, entryId);
         }
-        LOG.debug("Offset: " + offset);
-
-        buffer.limit(4);
-        buffer.rewind();
-        /*
-         * Read the length
-         */
-        ledger.read(buffer, offset);
-        buffer.flip();
-        len = buffer.getInt();
-        LOG.debug("Length of buffer: " + len);
-        buffer = ByteBuffer.allocate(len);
-        /*
-         * Read the rest. We add 4 to skip the length
-         */
-        ledger.read(buffer, offset + 4);
-        buffer.flip();
-        return buffer;
+        return ByteBuffer.wrap(entryLogger.readEntry(ledgerId, entryId, offset));
     }
     void close() {
-        try {
-            ledger.close();
-        } catch (IOException e) {
-            LOG.warn("Error closing ledger " + ledgerId, e);
-        }
-        try {
-            ledgerIndex.close();
-        } catch (IOException e) {
-            LOG.warn("Error closing index for ledger " + ledgerId, e);
-        }
     }
 }

+ 151 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/LedgerEntryPage.java

@@ -0,0 +1,151 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.bookkeeper.bookie;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * This is a page in the LedgerCache. It holds the locations
+ * (entrylogfile, offset) for entry ids.
+ *
+ * A page stores ENTRIES_PER_PAGES consecutive 8-byte offsets, the first of
+ * which belongs to entry id {@code firstEntry}. Access to the page contents
+ * must be bracketed by {@link #usePage()} and {@link #releasePage()}; the
+ * content accessors throw IllegalStateException otherwise.
+ */
+public class LedgerEntryPage {
+    /** Size of a page in bytes. */
+    public static final int PAGE_SIZE = 8192;
+    /** Number of 8-byte offsets that fit in one page. */
+    public static final int ENTRIES_PER_PAGES = PAGE_SIZE/8;
+    private long ledger = -1;
+    private long firstEntry = -1;
+    private ByteBuffer page = ByteBuffer.allocateDirect(PAGE_SIZE);
+    private boolean clean = true;
+    private boolean pinned = false;
+    private int useCount;
+    // Incremented on every setOffset() so setClean() can detect concurrent changes
+    private int version;
+    
+    /**
+     * @return "ledger@firstEntry", the clean/dirty state and the use count
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append(getLedger());
+        sb.append('@');
+        sb.append(getFirstEntry());
+        sb.append(clean ? " clean " : " dirty ");
+        sb.append(useCount);
+        return sb.toString();
+    }
+    /** Marks the page as in use; must be balanced by releasePage(). */
+    synchronized public void usePage() {
+        useCount++;
+    }
+    synchronized public void pin() {
+        pinned = true;
+    }
+    synchronized public void unpin() {
+        pinned = false;
+    }
+    synchronized public boolean isPinned() {
+        return pinned;
+    }
+    /**
+     * Releases one use of the page.
+     *
+     * @throws IllegalStateException if the page is released more often than used
+     */
+    synchronized public void releasePage() {
+        useCount--;
+        if (useCount < 0) {
+            throw new IllegalStateException("Use count has gone below 0");
+        }
+    }
+    /** Fails with IllegalStateException unless the page is currently in use. */
+    synchronized private void checkPage() {
+        if (useCount <= 0) {
+            throw new IllegalStateException("Page not marked in use");
+        }
+    }
+    /**
+     * Two pages are equal when they cover the same ledger and first entry.
+     */
+    @Override
+    public boolean equals(Object other) {
+        // Return false for null and foreign types rather than failing with a
+        // NullPointerException or ClassCastException, per the equals contract.
+        if (!(other instanceof LedgerEntryPage)) {
+            return false;
+        }
+        LedgerEntryPage otherLEP = (LedgerEntryPage) other;
+        return otherLEP.getLedger() == getLedger() && otherLEP.getFirstEntry() == getFirstEntry();
+    }
+    @Override
+    public int hashCode() {
+        return (int)getLedger() ^ (int)(getFirstEntry());
+    }
+    /**
+     * Marks the page clean only if it has not been modified since the given
+     * version was observed; otherwise the page stays (or becomes) dirty.
+     *
+     * @param versionOfCleaning value of getVersion() taken when the page
+     *        contents were captured
+     */
+    void setClean(int versionOfCleaning) {
+        this.clean = (versionOfCleaning == version);
+    }
+    boolean isClean() {
+        return clean;
+    }
+    /**
+     * Stores an offset in the page and marks the page dirty.
+     *
+     * @param offset   value to store
+     * @param position byte position within the page
+     * @throws IllegalStateException if the page is not in use
+     */
+    public void setOffset(long offset, int position) {
+        checkPage();
+        version++;
+        this.clean = false;
+        page.putLong(position, offset);
+    }
+    /**
+     * @param position byte position within the page
+     * @return the offset stored at that position
+     * @throws IllegalStateException if the page is not in use
+     */
+    public long getOffset(int position) {
+        checkPage();
+        return page.getLong(position);
+    }
+    static final byte zeroPage[] = new byte[64*1024];
+    /** Overwrites the whole page with zeros and marks it clean. */
+    public void zeroPage() {
+        checkPage();
+        page.clear();
+        page.put(zeroPage, 0, page.remaining());
+        clean = true;
+    }
+    /**
+     * Fills the page from the given file, starting at byte position
+     * firstEntry*8, and marks the page clean.
+     *
+     * @param fi file to read the page from
+     * @throws IOException if the file cannot supply a full page
+     * @throws IllegalStateException if the page is not in use
+     */
+    public void readPage(FileInfo fi) throws IOException {
+        checkPage();
+        page.clear();
+        long start = getFirstEntry()*8;
+        while(page.remaining() != 0) {
+            // Resume after the bytes already received; reading from the start
+            // again after a partial read would duplicate the head of the page.
+            if (fi.read(page, start + page.position()) <= 0) {
+                throw new IOException("Short page read of ledger " + getLedger() + " tried to get " + page.capacity() + " from position " + getFirstEntry()*8 + " still need " + page.remaining());
+            }
+        }
+        clean = true;
+    }
+    /**
+     * @return the backing buffer, reset so that it spans the whole page
+     * @throws IllegalStateException if the page is not in use
+     */
+    public ByteBuffer getPageToWrite() {
+        checkPage();
+        page.clear();
+        return page;
+    }
+    void setLedger(long ledger) {
+        this.ledger = ledger;
+    }
+    long getLedger() {
+        return ledger;
+    }
+    int getVersion() {
+        return version;
+    }
+    /**
+     * @param firstEntry entry id of the first slot; must be a multiple of
+     *        ENTRIES_PER_PAGES
+     * @throws IllegalArgumentException if firstEntry is not page aligned
+     */
+    void setFirstEntry(long firstEntry) {
+        if (firstEntry % ENTRIES_PER_PAGES != 0) {
+            throw new IllegalArgumentException(firstEntry + " is not a multiple of " + ENTRIES_PER_PAGES);
+        }
+        this.firstEntry = firstEntry;
+    }
+    long getFirstEntry() {
+        return firstEntry;
+    }
+    public boolean inUse() {
+        return useCount > 0;
+    }
+    /**
+     * Scans the page from its last slot backwards for a stored offset.
+     *
+     * @return the entry id of the highest slot holding an offset greater
+     *         than 0, or 0 if no slot holds one
+     * @throws IllegalStateException if the page is not in use
+     */
+    public long getLastEntry() {
+        for(int i = ENTRIES_PER_PAGES - 1; i >= 0; i--) {
+            if (getOffset(i*8) > 0) {
+                return i + firstEntry;
+            }
+        }
+        return 0;
+    }
+}

+ 147 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/bookie/MarkerFileChannel.java

@@ -0,0 +1,147 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.bookkeeper.bookie;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.FileLock;
+import java.nio.channels.ReadableByteChannel;
+import java.nio.channels.WritableByteChannel;
+
+/**
+ * This class is just a stub that can be used in collections with
+ * FileChannels. Every operation is a no-op: methods return 0 or null
+ * and never touch a file.
+ */
+public class MarkerFileChannel extends FileChannel {
+
+    @Override
+    public void force(boolean metaData) throws IOException {
+        // Stub: nothing to flush.
+
+    }
+
+    @Override
+    public FileLock lock(long position, long size, boolean shared)
+            throws IOException {
+        // Stub: no lock is acquired; always returns null.
+        return null;
+    }
+
+    @Override
+    public MappedByteBuffer map(MapMode mode, long position, long size)
+            throws IOException {
+        // Stub: nothing is mapped; always returns null.
+        return null;
+    }
+
+    @Override
+    public long position() throws IOException {
+        // Stub: always reports position 0.
+        return 0;
+    }
+
+    @Override
+    public FileChannel position(long newPosition) throws IOException {
+        // Stub: the position is ignored; returns null, not this channel.
+        return null;
+    }
+
+    @Override
+    public int read(ByteBuffer dst) throws IOException {
+        // Stub: reads nothing; always returns 0.
+        return 0;
+    }
+
+    @Override
+    public int read(ByteBuffer dst, long position) throws IOException {
+        // Stub: reads nothing; always returns 0.
+        return 0;
+    }
+
+    @Override
+    public long read(ByteBuffer[] dsts, int offset, int length)
+            throws IOException {
+        // Stub: reads nothing; always returns 0.
+        return 0;
+    }
+
+    @Override
+    public long size() throws IOException {
+        // Stub: always reports size 0.
+        return 0;
+    }
+
+    @Override
+    public long transferFrom(ReadableByteChannel src, long position, long count)
+            throws IOException {
+        // Stub: transfers nothing; always returns 0.
+        return 0;
+    }
+
+    @Override
+    public long transferTo(long position, long count, WritableByteChannel target)
+            throws IOException {
+        // Stub: transfers nothing; always returns 0.
+        return 0;
+    }
+
+    @Override
+    public FileChannel truncate(long size) throws IOException {
+        // Stub: nothing is truncated; returns null, not this channel.
+        return null;
+    }
+
+    @Override
+    public FileLock tryLock(long position, long size, boolean shared)
+            throws IOException {
+        // Stub: no lock is acquired; always returns null.
+        return null;
+    }
+
+    @Override
+    public int write(ByteBuffer src) throws IOException {
+        // Stub: writes nothing; always returns 0.
+        return 0;
+    }
+
+    @Override
+    public int write(ByteBuffer src, long position) throws IOException {
+        // Stub: writes nothing; always returns 0.
+        return 0;
+    }
+
+    @Override
+    public long write(ByteBuffer[] srcs, int offset, int length)
+            throws IOException {
+        // Stub: writes nothing; always returns 0.
+        return 0;
+    }
+
+    @Override
+    protected void implCloseChannel() throws IOException {
+        // Stub: there is no underlying resource to close.
+
+    }
+
+}

+ 93 - 73
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/AsyncCallback.java

@@ -1,81 +1,101 @@
 package org.apache.bookkeeper.client;
 
+import java.util.Enumeration;
+
 /**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
  * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
  */
 
 public interface AsyncCallback {
-    public interface AddCallback {
-        /**
-         * Callback declaration
-         * 
-         * @param rc    return code
-         * @param ledgerId  ledger identifier
-         * @param entryId   entry identifier
-         * @param ctx   control object
-         */
-        void addComplete(int rc, LedgerHandle lh, long entryId, Object ctx);
-    }
-    
-    public interface CloseCallback {
-        /**
-         * Callback definition
-         * 
-         * @param rc    return code
-         * @param ledgerId  ledger identifier
-         * @param ctx   control object
-         */
-        void closeComplete(int rc, LedgerHandle lh, Object ctx);
-    }
-    
-    public interface CreateCallback {
-        /**
-         * Declaration of callback method
-         * 
-         * @param rc    return status
-         * @param lh    ledger handle
-         * @param ctx   control object
-         */
-        
-        void createComplete(int rc, LedgerHandle lh, Object ctx);
-    }
-    
-    public interface OpenCallback {
-        /**
-         * Callback for asynchronous call to open ledger
-         * 
-         * @param rc
-         * @param lh
-         * @param ctx
-         */
-        
-        public void openComplete(int rc, LedgerHandle lh, Object ctx);
-        
-    }
-    
-    public interface ReadCallback {
-        /**
-         * Callback declaration
-         * 
-         * @param rc    return code
-         * @param ledgerId  ledger identifier
-         * @param seq   sequence of entries
-         * @param ctx   control object
-         */
-        void readComplete(int rc, LedgerHandle lh, LedgerSequence seq, Object ctx);
-    }
+  public interface AddCallback {
+    /**
+     * Callback declaration
+     * 
+     * @param rc
+     *          return code
+     * @param lh
+     *          ledger handle
+     * @param entryId
+     *          entry identifier
+     * @param ctx
+     *          control object
+     */
+    void addComplete(int rc, LedgerHandle lh, long entryId, Object ctx);
+  }
+
+  public interface CloseCallback {
+    /**
+     * Callback definition
+     * 
+     * @param rc
+     *          return code
+     * @param lh
+     *          ledger handle
+     * @param ctx
+     *          control object
+     */
+    void closeComplete(int rc, LedgerHandle lh, Object ctx);
+  }
+
+  public interface CreateCallback {
+    /**
+     * Declaration of callback method
+     * 
+     * @param rc
+     *          return status
+     * @param lh
+     *          ledger handle
+     * @param ctx
+     *          control object supplied by the caller
+     */
+
+    void createComplete(int rc, LedgerHandle lh, Object ctx);
+  }
+
+  public interface OpenCallback {
+    /**
+     * Callback for asynchronous call to open ledger
+     * 
+     * @param rc
+     *          return code
+     * @param lh
+     *          ledger handle
+     * @param ctx
+     *          control object supplied by the caller
+     */
+
+    public void openComplete(int rc, LedgerHandle lh, Object ctx);
+
+  }
+
+  public interface ReadCallback {
+    /**
+     * Callback declaration
+     * 
+     * @param rc
+     *          return code
+     * @param lh
+     *          ledger handle
+     * @param seq
+     *          sequence of entries, as an enumeration of LedgerEntry
+     * @param ctx
+     *          control object
+     */
+
+    void readComplete(int rc, LedgerHandle lh, Enumeration<LedgerEntry> seq,
+        Object ctx);
+  }
 }

+ 0 - 107
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/BKDefs.java

@@ -1,107 +0,0 @@
-package org.apache.bookkeeper.client;
-
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-public interface BKDefs { 
-    /**
-     * String used to construct znode paths. They are used in BookKeeper
-     *  and LedgerManagementProcessor.
-     */
-    
-    /*
-     * Path to ledger metadata. ZooKeeper appends a sequence number to L.
-     */
-    static public final String prefix = "/ledgers/L";
-    
-    /*
-     * Parent node to store ensemble composition. Each child corresponds to
-     * one bookie.
-     */
-    static public final String ensemble = "/ensemble"; 
-    
-    /*
-     * Quorum size.
-     */
-    static public final String quorumSize = "/quorum";
-    
-    /*
-     * Close node.
-     */
-    static public final String close = "/close";
-    
-    /*
-     * Quorum mode: VERIFYING or GENERIC
-     */
-    static public final String quorumMode = "/mode";
-    
-    /*
-     * Marks failure points in during writes to the ledger.
-     */
-    static public final String quorumEvolution = "/quorum_evolution";
-    
-    /*
-     * Ledger is in write mode
-     */
-    
-    static public final int WRITE = 0;
-
-    /*
-     * Ledger is in read mode
-     */
-
-    static public final int READ = 1;
-    
-    /**
-     * Status ok
-     */
-    public final int EOK = 0;
-    
-    /**
-     * Insufficient bookies
-     */
-    public final int EIB = -1;
- 
-    /**
-     * No such a ledger
-     */
-    public final int ENL = -2;
-    
-    /**
-     * Error while recovering ledger
-     */
-    public final int ERL = -3;
-    
-    /**
-     * Error while reading from zookeeper or writing to zookeeper
-     */
-    public final int EZK = -4;
-
-    /**
-     * IO error, typically when trying to connect to a bookie
-     */
-    public final int EIO = -5;
-    
-    /**
-     * Exceeded number of retries
-     */
-    public final int ENR = -6;
-}

+ 105 - 41
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/BKException.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.client;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,11 +21,10 @@ package org.apache.bookkeeper.client;
  * 
  */
 
-
 import java.lang.Exception;
 
 /**
- * Implements BookKeeper exceptions. 
+ * Class that enumerates all the possible error conditions
  * 
  */
 
@@ -32,12 +32,18 @@ import java.lang.Exception;
 public abstract class BKException extends Exception {
 
     private int code;
-    public BKException(int code){
+
+    BKException(int code) {
         this.code = code;
     }
-    
-    public static BKException create(int code){
-        switch(code){
+
+    /**
+     * Create an exception from an error code
+     * @param code return error code
+     * @return corresponding exception
+     */
+    public static BKException create(int code) {
+        switch (code) {
         case Code.ReadException:
             return new BKReadException();
         case Code.QuorumException:
@@ -54,11 +60,25 @@ public abstract class BKException extends Exception {
             return new BKNoSuchLedgerExistsException();
         case Code.BookieHandleNotAvailableException:
             return new BKBookieHandleNotAvailableException();
+        case Code.ZKException:
+            return new ZKException();
+        case Code.LedgerRecoveryException:
+            return new BKLedgerRecoveryException();
+        case Code.LedgerClosedException:
+            return new BKLedgerClosedException();
+        case Code.WriteException:
+            return new BKWriteException();
+        case Code.NoSuchEntryException:
+            return new BKNoSuchEntryException();
         default:
             return new BKIllegalOpException();
         }
     }
-    
+
+    /**
+     * List of return codes
+     *
+     */
     public interface Code {
         int OK = 0;
         int ReadException = -1;
@@ -69,20 +89,25 @@ public abstract class BKException extends Exception {
         int NotEnoughBookiesException = -6;
         int NoSuchLedgerExistsException = -7;
         int BookieHandleNotAvailableException = -8;
-        
+        int ZKException = -9;
+        int LedgerRecoveryException = -10;
+        int LedgerClosedException = -11;
+        int WriteException = -12;
+        int NoSuchEntryException = -13;
+
         int IllegalOpException = -100;
     }
-    
-    public void setCode(int code){
+
+    public void setCode(int code) {
         this.code = code;
     }
-    
-    public int getCode(){
+
+    public int getCode() {
         return this.code;
     }
-    
-    public String getMessage(int code){
-        switch(code){
+
+    public static String getMessage(int code) {
+        switch (code) {
         case Code.OK:
             return "No problem";
         case Code.ReadException:
@@ -101,63 +126,102 @@ public abstract class BKException extends Exception {
             return "No such ledger exists";
         case Code.BookieHandleNotAvailableException:
             return "Bookie handle is not available";
+        case Code.ZKException:
+            return "Error while using ZooKeeper";
+        case Code.LedgerRecoveryException:
+            return "Error while recovering ledger";
+        case Code.LedgerClosedException:
+            return "Attempt to write to a closed ledger";
+        case Code.WriteException:
+            return "Write failed on bookie";
+        case Code.NoSuchEntryException:
+            return "No such entry";
         default:
             return "Invalid operation";
         }
     }
-    
+
     public static class BKReadException extends BKException {
-        public BKReadException(){
+        public BKReadException() {
             super(Code.ReadException);
-        }   
+        }
     }
-    
+
+    public static class BKNoSuchEntryException extends BKException {
+        public BKNoSuchEntryException() {
+            super(Code.NoSuchEntryException);
+        }
+    }
+
     public static class BKQuorumException extends BKException {
-        public BKQuorumException(){
+        public BKQuorumException() {
             super(Code.QuorumException);
-        }   
+        }
     }
-     
+
     public static class BKBookieException extends BKException {
-        public BKBookieException(){
+        public BKBookieException() {
             super(Code.NoBookieAvailableException);
-        }   
+        }
     }
-    
+
     public static class BKDigestNotInitializedException extends BKException {
-        public BKDigestNotInitializedException(){
+        public BKDigestNotInitializedException() {
             super(Code.DigestNotInitializedException);
-        }   
+        }
     }
-    
+
     public static class BKDigestMatchException extends BKException {
-        public BKDigestMatchException(){
+        public BKDigestMatchException() {
             super(Code.DigestMatchException);
-        }   
+        }
     }
-    
+
     public static class BKIllegalOpException extends BKException {
-        public BKIllegalOpException(){
+        public BKIllegalOpException() {
             super(Code.IllegalOpException);
-        }   
+        }
     }
-    
+
     public static class BKNotEnoughBookiesException extends BKException {
-        public BKNotEnoughBookiesException(){
+        public BKNotEnoughBookiesException() {
             super(Code.NotEnoughBookiesException);
         }
     }
 
+    public static class BKWriteException extends BKException {
+        public BKWriteException() {
+            super(Code.WriteException);
+        }
+    }
+
     public static class BKNoSuchLedgerExistsException extends BKException {
-        public BKNoSuchLedgerExistsException(){
+        public BKNoSuchLedgerExistsException() {
             super(Code.NoSuchLedgerExistsException);
-        }   
+        }
     }
-    
+
     public static class BKBookieHandleNotAvailableException extends BKException {
-        public BKBookieHandleNotAvailableException(){
+        public BKBookieHandleNotAvailableException() {
             super(Code.BookieHandleNotAvailableException);
-        }   
+        }
+    }
+
+    public static class ZKException extends BKException {
+        public ZKException() {
+            super(Code.ZKException);
+        }
+    }
+
+    public static class BKLedgerRecoveryException extends BKException {
+        public BKLedgerRecoveryException() {
+            super(Code.LedgerRecoveryException);
+        }
+    }
+
+    public static class BKLedgerClosedException extends BKException {
+        public BKLedgerClosedException() {
+            super(Code.LedgerClosedException);
+        }
     }
 }
-    

+ 301 - 565
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/BookKeeper.java

@@ -1,4 +1,5 @@
-package org. apache.bookkeeper.client;
+package org.apache.bookkeeper.client;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,597 +21,332 @@ package org. apache.bookkeeper.client;
  * 
  */
 
-
 import java.io.IOException;
-import java.net.ConnectException;
-import java.nio.ByteBuffer;
-import java.nio.channels.UnresolvedAddressException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.HashMap;
-import java.util.Random;
-import java.net.InetSocketAddress;
-
+import java.util.concurrent.Executors;
 import org.apache.bookkeeper.client.BKException;
-import org.apache.bookkeeper.client.BookieHandle;
-import org.apache.bookkeeper.client.LedgerSequence;
 import org.apache.bookkeeper.client.AsyncCallback.CreateCallback;
 import org.apache.bookkeeper.client.AsyncCallback.OpenCallback;
 import org.apache.bookkeeper.client.BKException.Code;
-import org.apache.bookkeeper.client.LedgerHandle.QMode;
-import org.apache.bookkeeper.client.LedgerManagementProcessor.CreateLedgerOp;
-import org.apache.bookkeeper.client.LedgerManagementProcessor.OpenLedgerOp;
+import org.apache.bookkeeper.client.SyncCounter;
+import org.apache.bookkeeper.proto.BookieClient;
+import org.apache.bookkeeper.util.OrderedSafeExecutor;
 import org.apache.log4j.Logger;
 
-import org.apache.zookeeper.data.Stat;
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.ZooKeeper;
 import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.CreateMode;
-import org.apache.zookeeper.ZooDefs.Ids;
 import org.apache.zookeeper.WatchedEvent;
-
+import org.jboss.netty.channel.socket.ClientSocketChannelFactory;
+import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory;
 
 /**
- * BookKeeper client. We assume there is one single writer 
- * to a ledger at any time. 
+ * BookKeeper client. We assume there is one single writer to a ledger at any
+ * time.
+ * 
+ * There are three possible operations: start a new ledger, write to a ledger,
+ * and read from a ledger.
  * 
- * There are three possible operations: start a new ledger, 
- * write to a ledger, and read from a ledger.
+ * The exceptions resulting from synchronous calls and error codes resulting from
+ * asynchronous calls can be found in the class {@link BKException}.
  * 
- * For the ZooKeeper layout, please refer to BKDefs.java.
  * 
  */
 
-public class BookKeeper 
-implements Watcher {
- 
-    Logger LOG = Logger.getLogger(BookKeeper.class);
-    
-    ZooKeeper zk = null;
-    
-    /*
-     * The ledgerMngProcessor is a thread that processes
-     * asynchronously requests that handle ledgers, such
-     * as create, open, and close.
-     */
-    private static LedgerManagementProcessor ledgerMngProcessor;
-    
-    /*
-     * Blacklist of bookies
-     */
-    HashSet<InetSocketAddress> bookieBlackList;
-    
-    LedgerSequence responseRead;
-    Long responseLong;
-    
-    public BookKeeper(String servers) 
-    throws KeeperException, IOException{
-    	LOG.debug("Creating BookKeeper for servers " + servers);
-        //Create ZooKeeper object
-        this.zk = new ZooKeeper(servers, 10000, this);
-        
-        //List to enable clients to blacklist bookies
-        this.bookieBlackList = new HashSet<InetSocketAddress>();
-    }
-    
-    /**
-     * Watcher method. 
-     */
-    synchronized public void process(WatchedEvent event) {
-        LOG.debug("Process: " + event.getType() + " " + event.getPath());
-    }
-    
-    /**
-     * Formats ledger ID according to ZooKeeper rules
-     * 
-     * @param id	znode id
-     */
-    String getZKStringId(long id){
-        return String.format("%010d", id);        
-    }
-    
-    /**
-     * return the zookeeper instance
-     * @return return the zookeeper instance
-     */
-    ZooKeeper getZooKeeper() {
-        return zk;
-    }
-    
-    LedgerManagementProcessor getMngProcessor(){
-        if (ledgerMngProcessor == null){
-            ledgerMngProcessor = new LedgerManagementProcessor(this);
-            ledgerMngProcessor.start();
-        }
-        return ledgerMngProcessor;
-    }
-    
-    /**
-     * Creates a new ledger. To create a ledger, we need to specify the ensemble
-     * size, the quorum size, the operation mode, and a password. The ensemble size
-     * and the quorum size depend upon the operation mode. The operation mode can be
-     * GENERIC, VERIFIABLE, or FREEFORM (debugging). The password is used not only
-     * to authenticate access to a ledger, but also to verify entries in verifiable
-     * ledgers.
-     * 
-     * @param ensSize   ensemble size
-     * @param qSize     quorum size
-     * @param mode      quorum mode: VERIFIABLE (default), GENERIC, or FREEFORM
-     * @param passwd    password
-     */
-    public LedgerHandle createLedger(int ensSize, int qSize, QMode mode,  byte passwd[])
-        throws KeeperException, InterruptedException, 
-        IOException, BKException {
-        // Check that quorum size follows the minimum
-        long t;
-        LedgerHandle lh = null;
-        
-        switch(mode){
-        case VERIFIABLE:
-            t = java.lang.Math.round(java.lang.Math.floor((ensSize - 1)/2));
-            if(t == 0){
-                LOG.error("Tolerates 0 bookie failures"); 
-                throw BKException.create(Code.QuorumException);
-            }
-            break;
-        case GENERIC:
-            t = java.lang.Math.round(java.lang.Math.floor((ensSize - 1)/3));
-            if(t == 0){
-                LOG.error("Tolerates 0 bookie failures"); 
-                throw BKException.create(Code.QuorumException);
-            }
-            break;
-        case FREEFORM:
-            break;
-        }
-        /*
-         * Create ledger node on ZK.
-         * We get the id from the sequence number on the node.
-         */
-        String path = zk.create(BKDefs.prefix, new byte[0], 
-                Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT_SEQUENTIAL);
-        /* 
-         * Extract ledger id.
-         */
-        String parts[] = path.split("/");
-        String subparts[] = parts[2].split("L");
-        try{
-            long lId = Long.parseLong(subparts[1]);
-       
-            /* 
-             * Get children from "/ledgers/available" on zk
-             */
-            List<String> list = 
-                zk.getChildren("/ledgers/available", false);
-            ArrayList<InetSocketAddress> lBookies = new ArrayList<InetSocketAddress>();
-            /* 
-             * Select ensSize servers to form the ensemble
-             */
-            path = zk.create(BKDefs.prefix + getZKStringId(lId) + BKDefs.ensemble, new byte[0], 
-                    Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-         
-            /* 
-             * Add quorum size to ZK metadata
-             */
-            ByteBuffer bb = ByteBuffer.allocate(4);
-            bb.putInt(qSize);
-            zk.create(BKDefs.prefix + getZKStringId(lId) + BKDefs.quorumSize, bb.array(), 
-                    Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            /* 
-             * Quorum mode
-             */
-            bb = ByteBuffer.allocate(4);
-            bb.putInt(mode.ordinal());
-            zk.create(BKDefs.prefix + getZKStringId(lId) + BKDefs.quorumMode, bb.array(), 
-                    Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            /* 
-             * Create QuorumEngine
-             */
-            lh = new LedgerHandle(this, lId, 0, qSize, mode, passwd);
-            
-            /*
-             * Adding bookies to ledger handle
-             */
-            Random r = new Random();
-        
-            for(int i = 0; i < ensSize; i++){
-                int index = 0;
-                if(list.size() > 1) 
-                    index = r.nextInt(list.size() - 1);
-                else if(list.size() == 1)
-                    index = 0;
-                else {
-                    LOG.error("Not enough bookies available");
-        	    
-                    return null;
-                }
-            
-                try{
-                    String bookie = list.remove(index);
-                    LOG.info("Bookie: " + bookie);
-                    InetSocketAddress tAddr = parseAddr(bookie);
-                    int bindex = lh.addBookieForWriting(tAddr); 
-                    ByteBuffer bindexBuf = ByteBuffer.allocate(4);
-                    bindexBuf.putInt(bindex);
-        	    
-                    String pBookie = "/" + bookie;
-                    zk.create(BKDefs.prefix + getZKStringId(lId) + BKDefs.ensemble + pBookie, bindexBuf.array(), 
-                            Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-                } catch (IOException e) {
-                    LOG.error(e);
-                    i--;
-                } 
-            }
-            LOG.debug("Created new ledger");
-        } catch (NumberFormatException e) {
-            LOG.error("Error when parsing the ledger identifier", e);
-        }
-        // Return ledger handler
-        return lh; 
-    }
+public class BookKeeper implements OpenCallback, CreateCallback {
 
-    /**
-     * Creates a new ledger. Default of 3 servers, and quorum of 2 servers,
-     * verifiable ledger.
-     * 
-     * @param passwd	password
-     */
-    public LedgerHandle createLedger(byte passwd[])
-    throws KeeperException, BKException, 
-    InterruptedException, IOException {
-        return createLedger(3, 2, QMode.VERIFIABLE, passwd);
-    }
+  static final Logger LOG = Logger.getLogger(BookKeeper.class);
 
-    /**
-     * Asychronous call to create ledger
-     * 
-     * @param ensSize
-     * @param qSize
-     * @param mode
-     * @param passwd
-     * @param cb
-     * @param ctx
-     * @throws KeeperException
-     * @throws InterruptedException
-     * @throws IOException
-     * @throws BKException
-     */
-    public void asyncCreateLedger(int ensSize, 
-            int qSize, 
-            QMode mode,  
-            byte passwd[],
-            CreateCallback cb,
-            Object ctx
-            )
-    throws KeeperException, InterruptedException, 
-    IOException, BKException {
-        CreateLedgerOp op = new CreateLedgerOp(ensSize, 
-                qSize, 
-                mode, 
-                passwd, 
-                cb, 
-                ctx);
-        LedgerManagementProcessor lmp = getMngProcessor();
-        lmp.addOp(op);
-        
-    }
-    
-    /**
-     * Open existing ledger for reading. Default for quorum size is 2.
-     * 
-     * @param long  the long corresponding to the ledger id
-     * @param byte[]    byte array corresponding to the password to access a ledger
-     * @param int   the quorum size, it has to be at least ceil(n+1/2)
-     */
-    public LedgerHandle openLedger(long lId, byte passwd[])
-    throws KeeperException, InterruptedException, IOException, BKException {
-        
-        Stat stat = null;
-        
-        /*
-         * Check if ledger exists
-         */
-        if(zk.exists(BKDefs.prefix + getZKStringId(lId), false) == null){
-            LOG.error("Ledger " + getZKStringId(lId) + " doesn't exist.");
-            throw BKException.create(Code.NoSuchLedgerExistsException);
-        }
-        
-        /*
-         * Get quorum size.
-         */
-        ByteBuffer bb = ByteBuffer.wrap(zk.getData(BKDefs.prefix + getZKStringId(lId) + BKDefs.quorumSize, false, stat));
-        int qSize = bb.getInt();
-         
-        /*
-         * Get last entry written from ZK 
-         */
-        
-        long last = 0;
-        LOG.debug("Close path: " + BKDefs.prefix + getZKStringId(lId) + BKDefs.close);
-        if(zk.exists(BKDefs.prefix + getZKStringId(lId) + BKDefs.close, false) == null){
-            recoverLedger(lId, passwd);
-        }
-            
-        stat = null;
-        byte[] data = zk.getData(BKDefs.prefix + getZKStringId(lId) + BKDefs.close, false, stat);
-        ByteBuffer buf = ByteBuffer.wrap(data);
-        last = buf.getLong();
-        //zk.delete(BKDefs.prefix + getZKStringId(lId) + BKDefs.close, -1);
-        
-        /*
-         * Quorum mode 
-         */
-        data = zk.getData(BKDefs.prefix + getZKStringId(lId) + BKDefs.quorumMode, false, stat);
-        buf = ByteBuffer.wrap(data);
-        
-        QMode qMode;
-        switch(buf.getInt()){
-        case 1:
-            qMode = QMode.GENERIC;
-            LOG.info("Generic ledger");
-            break;
-        case 2:
-            qMode = QMode.FREEFORM;
-            break;
-        default:
-            qMode = QMode.VERIFIABLE;
-            LOG.info("Verifiable ledger");
-        }
-        
-        /*
-         *  Create QuorumEngine
-         */
-        LedgerHandle lh = new LedgerHandle(this, lId, last, qSize, qMode, passwd);
-        
-        /*
-         * Get children of "/ledgers/id/ensemble" 
-         */
-        
-        List<String> list = 
-            zk.getChildren(BKDefs.prefix + getZKStringId(lId) + BKDefs.ensemble, false);
-        
-        LOG.debug("Length of list of bookies: " + list.size());
-        for(int i = 0 ; i < list.size() ; i++){
-            for(String s : list){
-                LOG.debug("Extracting bookie: " + s);
-                byte[] bindex = zk.getData(BKDefs.prefix + getZKStringId(lId) + BKDefs.ensemble + "/" + s, false, stat);
-                ByteBuffer bindexBuf = ByteBuffer.wrap(bindex);
-                if(bindexBuf.getInt() == i){                      
-                    try{
-                        lh.addBookieForReading(parseAddr(s));
-                    } catch (IOException e){
-                        LOG.error(e);
-                    }
-                }
-            }
-        }
-        
-        /*
-         * Read changes to quorum over time. To determine if there has been changes during
-         * writes to the ledger, check if there is a znode called quorumEvolution.
-         */
-        if(zk.exists(BKDefs.prefix + 
-                getZKStringId(lh.getId()) +  
-                BKDefs.quorumEvolution, false) != null){
-                    String path = BKDefs.prefix + 
-                    getZKStringId(lh.getId()) +  
-                    BKDefs.quorumEvolution;
-                    
-                    List<String> faultList = zk.getChildren(path, false);
-                    try{
-                        for(String s : faultList){
-                            LOG.debug("Faulty list child: " + s);
-                            long entry = Long.parseLong(s);
-                            String addresses = new String(zk.getData(path + "/" + s, false, stat));
-                            String parts[] = addresses.split(" ");
-
-                            ArrayList<BookieHandle> newBookieSet = new ArrayList<BookieHandle>();
-                            for(int i = 0 ; i < parts.length ; i++){
-                                LOG.debug("Address: " + parts[i]);
-                                InetSocketAddress faultyBookie =  
-                                    parseAddr(parts[i].substring(1));                           
-                        
-                                newBookieSet.add(lh.getBookieHandleDup(faultyBookie));
-                            }
-                            lh.setNewBookieConfig(entry, newBookieSet);
-                            LOG.debug("NewBookieSet size: " + newBookieSet.size());
-                        }
-
-                        lh.prepareEntryChange();
-                    } catch (NumberFormatException e) {
-                        LOG.error("Error when parsing the ledger identifier", e);
-                    }
-                }
-      
-        /*
-         *  Return ledger handler
-         */
-        return lh;
-    }    
-    
-    public void asyncOpenLedger(long lId, byte passwd[], OpenCallback cb, Object ctx)
-    throws InterruptedException{
-        OpenLedgerOp op = new OpenLedgerOp(lId, 
-                passwd,  
-                cb, 
-                ctx);
-        LedgerManagementProcessor lmp = getMngProcessor();
-        lmp.addOp(op);
-    }
-    
-    /**
-     * Parses address into IP and port.
-     * 
-     *  @param addr	String
-     */
-    
-    InetSocketAddress parseAddr(String s){
-        String parts[] = s.split(":");
-        if (parts.length != 2) {
-            System.out.println(s
-                    + " does not have the form host:port");
-        }
-        InetSocketAddress addr = new InetSocketAddress(parts[0],
-                Integer.parseInt(parts[1]));
-        return addr;
-    }
-    
- 
-    /**
-     * Check if close node exists. 
-     * 
-     * @param ledgerId	id of the ledger to check
-     */
-    public boolean hasClosed(long ledgerId)
-    throws KeeperException, InterruptedException{
-        String closePath = BKDefs.prefix + getZKStringId(ledgerId) + BKDefs.close;
-        if(zk.exists(closePath, false) == null) return false;
-        else return true;
-    }
-    
-    /**
-     * Recover a ledger that was not closed properly.
-     * 
-     * @param lId	ledger identifier
-     * @param passwd	password
-     */
-    
-    boolean recoverLedger(long lId, byte passwd[])
-    throws KeeperException, InterruptedException, IOException, BKException {
-        
-        Stat stat = null;
-       
-        LOG.info("Recovering ledger");
-        
-        /*
-         * Get quorum size.
-         */
-        ByteBuffer bb = ByteBuffer.wrap(zk.getData(BKDefs.prefix + getZKStringId(lId) + BKDefs.quorumSize, false, stat));
-        int qSize = bb.getInt();
-                
-        
-        /*
-         * Get children of "/ledgers/id/ensemble" 
-         */
-        
-        List<String> list = 
-            zk.getChildren(BKDefs.prefix + getZKStringId(lId) + BKDefs.ensemble, false);
-        
-        ArrayList<InetSocketAddress> addresses = new ArrayList<InetSocketAddress>();
-        for(String s : list){
-            addresses.add(parseAddr(s));
-        }
-        
-        /*
-         * Quorum mode 
-         */
-        byte[] data = zk.getData(BKDefs.prefix + getZKStringId(lId) + BKDefs.quorumMode, false, stat);
-        ByteBuffer buf = ByteBuffer.wrap(data);
-        //int ordinal = buf.getInt();
-            
-        QMode qMode = QMode.VERIFIABLE;
-        switch(buf.getInt()){
-        case 0:
-            qMode = QMode.VERIFIABLE;
-            break;
-        case 1:
-            qMode = QMode.GENERIC;
-            break;
-        case 2:
-            qMode = QMode.FREEFORM;
-            break;
+  ZooKeeper zk = null;
+  // whether the zk handle is one we created, or is owned by whoever
+  // instantiated us
+  boolean ownZKHandle = false;
+
+  ClientSocketChannelFactory channelFactory;
+  // whether the socket factory is one we created, or is owned by whoever
+  // instantiated us
+  boolean ownChannelFactory = false;
+
+  BookieClient bookieClient;
+  BookieWatcher bookieWatcher;
+
+  OrderedSafeExecutor callbackWorker = new OrderedSafeExecutor(Runtime
+      .getRuntime().availableProcessors());
+  OrderedSafeExecutor mainWorkerPool = new OrderedSafeExecutor(Runtime
+      .getRuntime().availableProcessors());
+
+  /**
+   * Create a bookkeeper client. A zookeeper client and a client socket factory
+   * will be instantiated as part of this constructor.
+   * 
+   * @param servers
+   *          A list of one or more servers on which zookeeper is running. The
+   *          client assumes that the running bookies have been registered with
+   *          zookeeper under the path
+   *          {@link BookieWatcher#BOOKIE_REGISTRATION_PATH}
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws KeeperException
+   */
+  public BookKeeper(String servers) throws IOException, InterruptedException,
+      KeeperException {
+    this(new ZooKeeper(servers, 10000, new Watcher() {
+      @Override
+      public void process(WatchedEvent event) {
+        // TODO: handle session disconnects and expires
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Process: " + event.getType() + " " + event.getPath());
         }
-        
-        /*
-         * Create ledger recovery monitor object
-         */
-        
-        LedgerRecoveryMonitor lrm = new LedgerRecoveryMonitor(this, lId, qSize, addresses, qMode);
-        
-        return lrm.recover(passwd);
+      }
+    }), new NioClientSocketChannelFactory(Executors.newCachedThreadPool(),
+        Executors.newCachedThreadPool()));
+
+    ownZKHandle = true;
+    ownChannelFactory = true;
+  }
+
+  /**
+   * Create a bookkeeper client but use the passed in zookeeper client instead
+   * of instantiating one. A client socket channel factory is instantiated by
+   * this constructor and flagged as owned by this client (ownChannelFactory);
+   * this constructor does not flag the zookeeper handle as owned.
+   * 
+   * @param zk
+   *          Zookeeper client instance connected to the zookeeper with which
+   *          the bookies have registered
+   * @throws InterruptedException
+   * @throws KeeperException
+   */
+  public BookKeeper(ZooKeeper zk) throws InterruptedException, KeeperException {
+    this(zk, new NioClientSocketChannelFactory(Executors.newCachedThreadPool(),
+        Executors.newCachedThreadPool()));
+    // The factory above was created here rather than supplied by the caller.
+    ownChannelFactory = true;
+  }
+
+  /**
+   * Create a bookkeeper client but use the passed in zookeeper client and
+   * client socket channel factory instead of instantiating those.
+   * 
+   * @param zk
+   *          Zookeeper client instance connected to the zookeeper with which
+   *          the bookies have registered
+   * @param channelFactory
+   *          A factory that will be used to create connections to the bookies
+   * @throws InterruptedException
+   * @throws KeeperException
+   */
+  public BookKeeper(ZooKeeper zk, ClientSocketChannelFactory channelFactory)
+      throws InterruptedException, KeeperException {
+    if (zk == null || channelFactory == null) {
+      throw new NullPointerException();
     }
-    
-    /**
-     * Get new bookies
-     * 
-     * @param addrList	list of bookies to replace
+    this.zk = zk;
+    this.channelFactory = channelFactory;
+    bookieWatcher = new BookieWatcher(this);
+    bookieWatcher.readBookiesBlocking();
+    bookieClient = new BookieClient(channelFactory, mainWorkerPool);
+  }
+
+  /**
+   * There are 2 digest types that can be used for verification. The CRC32 is
+   * cheap to compute but does not protect against byzantine bookies (i.e., a
+   * bookie might report fake bytes and a matching CRC32). The MAC code is more
+   * expensive to compute, but is protected by a password, i.e., a bookie can't
+   * report fake bytes with a matching MAC unless it knows the password.
+   */
+  public enum DigestType {
+    MAC, CRC32
+  }
+
+  /**
+   * Returns the zookeeper handle held by this client.
+   * 
+   * @return the value of the zk field
+   */
+  public ZooKeeper getZkHandle() {
+    return zk;
+  }
+
+  /**
+   * Asynchronously creates a new ledger. The caller specifies the ensemble
+   * size (how many bookies the entries should be striped among), the quorum
+   * size (the degree of replication of each entry), the digest type, a
+   * password, a callback implementation, and an optional control object. The
+   * digest type is either a MAC or a CRC; note that the CRC option is not able
+   * to protect a client against a bookie that replaces an entry. The password
+   * is used not only to authenticate access to a ledger, but also to verify
+   * entries in ledgers.
+   * 
+   * @param ensSize
+   *          ensemble size
+   * @param qSize
+   *          quorum size
+   * @param digestType
+   *          digest type, either MAC or CRC32
+   * @param passwd
+   *          password
+   * @param cb
+   *          createCallback implementation
+   * @param ctx
+   *          optional control object
+   */
+  public void asyncCreateLedger(int ensSize, int qSize, DigestType digestType,
+      byte[] passwd, CreateCallback cb, Object ctx) {
+    // Build the create operation first, then start it.
+    LedgerCreateOp createOp = new LedgerCreateOp(this, ensSize, qSize,
+        digestType, passwd, cb, ctx);
+    createOp.initiate();
+  }
+
+  /**
+   * Create callback implementation for synchronous create call. The context
+   * object is cast to a SyncCounter; the ledger handle and return code are
+   * stored on it before it is decremented.
+   * 
+   * @param rc
+   *          return code
+   * @param lh
+   *          ledger handle object
+   * @param ctx
+   *          control object; must be the SyncCounter passed by the synchronous
+   *          create call
+   */
+  public void createComplete(int rc, LedgerHandle lh, Object ctx) {
+    SyncCounter counter = (SyncCounter) ctx;
+    counter.setLh(lh);
+    counter.setrc(rc);
+    // Decrement last, after both results are recorded; presumably this is what
+    // releases the caller blocked on the counter -- confirm against SyncCounter.
+    counter.dec();
+  }
+
+  /**
+   * Creates a new ledger. Default of 3 servers, and quorum of 2 servers.
+   * Delegates to the four-argument createLedger with those defaults.
+   * 
+   * @param digestType
+   *          digest type, either MAC or CRC32
+   * @param passwd
+   *          password
+   * @return the ledger handle returned by the four-argument createLedger
+   * @throws KeeperException
+   * @throws InterruptedException
+   * @throws BKException
+   * @throws IOException
+   */
+  public LedgerHandle createLedger(DigestType digestType, byte passwd[])
+      throws KeeperException, BKException, InterruptedException, IOException {
+    // 3 = ensemble size, 2 = quorum size
+    return createLedger(3, 2, digestType, passwd);
+  }
+
+  /**
+   * Synchronous call to create ledger. Parameters match those of
+   * {@link #asyncCreateLedger(int, int, DigestType, byte[], CreateCallback, Object)}
+   * 
+   * @param ensSize
+   * @param qSize
+   * @param digestType
+   * @param passwd
+   * @return
+   * @throws KeeperException
+   * @throws InterruptedException
+   * @throws IOException
+   * @throws BKException
+   */
+  public LedgerHandle createLedger(int ensSize, int qSize,
+      DigestType digestType, byte passwd[]) throws KeeperException,
+      InterruptedException, IOException, BKException {
+    SyncCounter counter = new SyncCounter();
+    counter.inc();
+    /*
+     * Calls asynchronous version
      */
-    InetSocketAddress getNewBookie(ArrayList<InetSocketAddress> addrList)
-    throws InterruptedException {
-        try{
-            // Get children from "/ledgers/available" on zk
-            List<String> list = 
-                zk.getChildren("/ledgers/available", false);
-            ArrayList<InetSocketAddress> lBookies = new ArrayList<InetSocketAddress>();
-    
-            for(String addr : list){
-                InetSocketAddress nAddr = parseAddr(addr); 
-                if(!addrList.contains(nAddr) &&
-                        !bookieBlackList.contains(nAddr))
-                    return nAddr;
-            }
-        } catch (KeeperException e){
-            LOG.error("Problem accessing ZooKeeper: " + e);
-        }
-        
-        return null;
-    }
-    
-    HashMap<InetSocketAddress, BookieHandle> bhMap = 
-    	new HashMap<InetSocketAddress, BookieHandle>();
-    
-    /**
-     *  Keeps a list of available BookieHandle objects and returns
-     *  the corresponding object given an address.
-     *  
-     *  @param	a	InetSocketAddress
+    asyncCreateLedger(ensSize, qSize, digestType, passwd, this, counter);
+
+    /*
+     * Wait
      */
-    
-    synchronized BookieHandle getBookieHandle(LedgerHandle lh, InetSocketAddress a)
-    throws ConnectException, IOException {
-    	if(!bhMap.containsKey(a)){
-    	    BookieHandle bh = new BookieHandle(a, true); 
-    		bhMap.put(a, bh);
-    		bh.start();
-    	}
-    	bhMap.get(a).incRefCount(lh);
-    	
-    	return bhMap.get(a);
+    counter.block(0);
+    // Propagate the return code reported by the create callback instead of
+    // reporting every failure as a ZooKeeper error; openLedger does the same.
+    if (counter.getrc() != BKException.Code.OK) {
+      LOG.error("Error while creating ledger: " + counter.getrc());
+      throw BKException.create(counter.getrc());
+    }
+    // A missing handle is still treated as a failure even if the code is OK.
+    if (counter.getLh() == null) {
+      LOG.error("ZooKeeper error: " + counter.getrc());
+      throw BKException.create(Code.ZKException);
     }
-    
-    /**
-     * When there are no more references to a BookieHandle,
-     * remove it from the list. 
+
+    return counter.getLh();
+  }
+
+  /**
+   * Asynchronously opens an existing ledger for reading.
+   * 
+   * @param lId
+   *          ledger identifier
+   * @param digestType
+   *          digest type, either MAC or CRC32
+   * @param passwd
+   *          password
+   * @param cb
+   *          open callback implementation
+   * @param ctx
+   *          optional control object
+   */
+  public void asyncOpenLedger(long lId, DigestType digestType, byte passwd[],
+      OpenCallback cb, Object ctx) {
+    // Build the open operation first, then start it.
+    LedgerOpenOp openOp = new LedgerOpenOp(this, lId, digestType, passwd, cb,
+        ctx);
+    openOp.initiate();
+  }
+
+  /**
+   * Callback method for synchronous open operation. The context object is
+   * cast to a SyncCounter; the ledger handle and return code are stored on it
+   * before it is decremented.
+   * 
+   * @param rc
+   *          return code
+   * @param lh
+   *          ledger handle
+   * @param ctx
+   *          control object; must be the SyncCounter passed by the synchronous
+   *          open call
+   */
+  public void openComplete(int rc, LedgerHandle lh, Object ctx) {
+    SyncCounter counter = (SyncCounter) ctx;
+    counter.setLh(lh);
+
+    // Only build the message when debug logging is on, as the zookeeper
+    // watcher created by this class already does.
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Open complete: " + rc);
+    }
+
+    counter.setrc(rc);
+    counter.dec();
+  }
+
+  /**
+   * Synchronous open ledger call
+   * 
+   * @param lId
+   *          ledger identifier
+   * @param digestType
+   *          digest type, either MAC or CRC32
+   * @param passwd
+   *          password
+   * @return
+   * @throws InterruptedException
+   * @throws BKException
+   */
+
+  public LedgerHandle openLedger(long lId, DigestType digestType, byte passwd[])
+      throws BKException, InterruptedException {
+    SyncCounter counter = new SyncCounter();
+    counter.inc();
+
+    /*
+     * Calls async open ledger
      */
-    
-    synchronized void haltBookieHandles(LedgerHandle lh, ArrayList<BookieHandle> bookies){
-        while(bookies.size() > 0){
-            BookieHandle bh = bookies.remove(0);
-            if(bh.halt(lh) <= 0)
-                bhMap.remove(bh.addr);
-        }
-    }
-    
-    /**
-     * Blacklists bookies.
-     * 
-     * @param addr 	address of bookie
+    asyncOpenLedger(lId, digestType, passwd, this, counter);
+
+    /*
+     * Wait
      */
-    void blackListBookie(InetSocketAddress addr){
-        bookieBlackList.add(addr);
+    counter.block(0);
+    if (counter.getrc() != BKException.Code.OK)
+      throw BKException.create(counter.getrc());
+
+    return counter.getLh();
+  }
+
+  /**
+   * Shuts down client.
+   * 
+   */
+  public void halt() throws InterruptedException {
+    bookieClient.close();
+    bookieWatcher.halt();
+    if (ownChannelFactory) {
+      channelFactory.releaseExternalResources();
     }
-    
-    /**
-     * Halts all bookie handles
-     * 
-     */
-    public void halt() throws InterruptedException{
-        
-        for(BookieHandle bh: bhMap.values()){
-            bh.shutdown();
-        }
-        zk.close();
+    if (ownZKHandle) {
+      zk.close();
     }
+    callbackWorker.shutdown();
+    mainWorkerPool.shutdown();
+  }
 }

+ 0 - 371
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/BookieHandle.java

@@ -1,371 +0,0 @@
-package org.apache.bookkeeper.client;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.io.IOException;
-import java.net.InetSocketAddress;
-import java.nio.ByteBuffer;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.TimeUnit;
-import java.security.NoSuchAlgorithmException;
-import java.security.InvalidKeyException;
-import javax.crypto.Mac; 
-import javax.crypto.spec.SecretKeySpec;
-
-import org.apache.bookkeeper.client.BKException.Code;
-import org.apache.bookkeeper.client.LedgerHandle.QMode;
-import org.apache.bookkeeper.client.QuorumEngine.Operation;
-import org.apache.bookkeeper.client.QuorumEngine.Operation.StopOp;
-import org.apache.bookkeeper.client.QuorumEngine.SubOp;
-import org.apache.bookkeeper.client.QuorumEngine.Operation.AddOp;
-import org.apache.bookkeeper.client.QuorumEngine.SubOp.SubAddOp;
-import org.apache.bookkeeper.client.QuorumEngine.SubOp.SubReadOp;
-import org.apache.bookkeeper.client.QuorumEngine.SubOp.SubStopOp;
-import org.apache.bookkeeper.proto.BookieClient;
-import org.apache.log4j.Logger;
-
-
-/**
- * Maintains a queue of request to a given bookie. For verifiable
- * ledgers, it computes the digest.
- * 
- */
-
-public class BookieHandle extends Thread {
-    static Logger LOG = Logger.getLogger(BookieClient.class);
-    
-    volatile boolean stop = false;
-    boolean noreception = false;
-    private BookieClient client;
-    InetSocketAddress addr;
-    static int recvTimeout = 2000;
-    private ArrayBlockingQueue<ToSend> incomingQueue;
-    private int refCount = 0;
-    HashSet<LedgerHandle> ledgers;
-    
-    /**
-     * Objects of this class are queued waiting to be
-     * processed.
-     */
-    private static class ToSend {
-    	LedgerHandle lh;
-        long entry = -1;
-        Object ctx;
-        int type;
-        
-        ToSend(LedgerHandle lh, SubOp sop, long entry){
-        	this.lh = lh;
-            this.type = sop.op.type;
-            this.entry = entry;
-            this.ctx = sop;
-        }
-    }
-    
-    /**
-     * @param addr	address of the bookkeeper server that this
-     * handle should connect to.
-     */
-    BookieHandle(InetSocketAddress addr, boolean enabled) throws IOException {
-        this.stop = !enabled;
-        this.noreception = !enabled;
-        if(!stop)
-            this.client = new BookieClient(addr, recvTimeout);
-        else
-            this.client = null;
-        
-        this.addr = addr;
-        this.incomingQueue = new ArrayBlockingQueue<ToSend>(2000);
-        this.ledgers = new HashSet<LedgerHandle>();
-    }
-    
-    
-    /**
-     * Restart BookieClient if can't talk to bookie
-     * 
-     * @return
-     * @throws IOException
-     */
-    void restart() throws IOException {
-        this.client = new BookieClient(addr, recvTimeout);
-    }
-
-    /**
-     * Sending add operation to bookie. We have to synchronize the send to guarantee
-     * that requests will either get a response or throw an exception. 
-     * 
-     * @param r
-     * @param cb
-     * @param ctx
-     * @throws IOException
-     */
-    public synchronized void sendAdd(LedgerHandle lh, SubAddOp r, long entry)
-    throws IOException, BKException {
-        try{
-            if(!noreception){
-                ToSend ts = new ToSend(lh, r, entry);
-                if(!incomingQueue.offer(ts, 1000, TimeUnit.MILLISECONDS))
-                    throw BKException.create(Code.BookieHandleNotAvailableException);
-            } else {
-                throw BKException.create(Code.BookieHandleNotAvailableException);
-            }
-        } catch(InterruptedException e){
-            LOG.warn("Interrupted while waiting for room in the incoming queue");
-        }
-    }
-    
-    private synchronized void sendStop(){
-        try{
-            noreception = true;
-            LOG.debug("Sending stop signal");
-            incomingQueue.put(new ToSend(null, new SubStopOp(new StopOp()), -1));
-            LOG.debug("Sent stop signal");
-        } catch(InterruptedException e) {
-            LOG.fatal("Interrupted while sending stop signal to bookie handle");
-        }       
-    }
-    /**
-     * MAC instance
-     * 
-     */
-    Mac mac = null;
-    
-    Mac getMac(byte[] macKey, String alg)
-    throws NoSuchAlgorithmException, InvalidKeyException {
-        if(mac == null){
-            mac = Mac.getInstance(alg);
-            mac.init(new SecretKeySpec(macKey, "HmacSHA1"));
-        }
-        
-        return mac;
-    }
-    
-    /**
-     * Sending read operation to bookie
-     * 
-     * @param r
-     * @param entry
-     * @param cb
-     * @param ctx
-     * @throws IOException
-     */
-    
-    public synchronized void sendRead(LedgerHandle lh, SubReadOp r, long entry)
-    throws IOException, BKException {
-        try{
-            if(!noreception){           
-                ToSend ts = new ToSend(lh, r, entry);
-                if(!incomingQueue.offer(ts, 1000, TimeUnit.MILLISECONDS))
-                    throw BKException.create(Code.BookieHandleNotAvailableException);
-            } else {
-                throw BKException.create(Code.BookieHandleNotAvailableException);
-            }
-        } catch(InterruptedException e){
-            LOG.warn("Interrupted while waiting for room in the incoming queue");
-        }
-    }
-    
-    public void run(){
-        ToSend ts;
-        
-        try{
-            while(!stop){
-                ts = incomingQueue.poll(1000, TimeUnit.MILLISECONDS);
-                    
-                if(ts != null){
-                	LedgerHandle self = ts.lh;
-                    switch(ts.type){
-                    case Operation.STOP:
-                        LOG.info("Stopping BookieHandle: " + addr);
-                        client.errorOut();                   
-                        cleanQueue();
-                        LOG.debug("Stopped");
-                        break;
-                    case Operation.ADD:
-                        SubAddOp aOp = (SubAddOp) ts.ctx;
-                        AddOp op = ((AddOp) aOp.op);
-                        
-                        long confirmed = self.getAddConfirmed();
-                        ByteBuffer extendedData;
-    
-                        if(self.getQMode() == QMode.VERIFIABLE){
-                            extendedData = ByteBuffer.allocate(op.data.length + 28 + 16);
-                            extendedData.putLong(self.getId());
-                            extendedData.putLong(ts.entry);
-                            extendedData.putLong(confirmed);
-                            extendedData.put(op.data);
-                        
-                        
-                            extendedData.rewind();
-                            byte[] toProcess = new byte[op.data.length + 24];
-                            extendedData.get(toProcess, 0, op.data.length + 24);
-                            extendedData.position(extendedData.capacity() - 20);
-                            if(mac == null)
-                                getMac(self.getMacKey(), "HmacSHA1");
-                            extendedData.put(mac.doFinal(toProcess));
-                            extendedData.position(16);
-                        } else {
-                            extendedData = ByteBuffer.allocate(op.data.length + 8);
-                            extendedData.putLong(confirmed);
-                            extendedData.put(op.data);
-                            extendedData.flip();
-                        }
-                        
-                        client.addEntry(self.getId(),
-                                self.getLedgerKey(),
-                                ts.entry, 
-                                extendedData, 
-                                aOp.wcb,
-                                ts.ctx);
-                        break;
-                    case Operation.READ:
-                        if(client != null)
-                            client.readEntry(self.getId(),
-                                    ts.entry,
-                                    ((SubReadOp) ts.ctx).rcb,
-                                    ts.ctx);
-                        else ((SubReadOp) ts.ctx).rcb.readEntryComplete(-1, ts.lh.getId(), ts.entry, null, ts.ctx);
-                        break;
-                    }
-                } else LOG.debug("Empty queue: " + addr);
-            }
-        } catch (Exception e){
-            LOG.error("Handling exception before halting BookieHandle", e);
-            for(LedgerHandle lh : ledgers)
-                lh.removeBookie(this);
-            
-            /*
-             * We only need to synchronize when setting noreception to avoid that
-             * a client thread add another request to the incomingQueue after we
-             * have cleaned it.
-             */
-            synchronized(this){
-                noreception = true;
-            }
-            client.halt();
-            client.errorOut();
-            cleanQueue();
-        } 
-        
-        LOG.info("Exiting bookie handle thread: " + addr);
-    }
-        
-    
-    /**
-     * Multiple ledgers may use the same BookieHandle object, so we keep
-     * a count on the number of references.
-     */
-    int incRefCount(LedgerHandle lh){
-        ledgers.add(lh);
-        return ++refCount;
-    }
-    
-    /**
-     * Halts if there is no ledger using this object.
-     *
-     * @return  int reference counter
-     */
-    synchronized int halt(LedgerHandle lh){
-        LOG.info("Calling halt");
-        ledgers.remove(lh);
-        int currentCount = --refCount;
-        if(currentCount <= 0){
-            shutdown();
-        }
-        
-        if(currentCount < 0)
-            LOG.warn("Miscalculated the number of reference counts: " + addr);
-
-        return currentCount;
-    }
-    
-    /**
-     * Halt this bookie handle independent of the number of ledgers using it. Called upon a 
-     * failure to write. This method cannot be called by this thread because it may cause a
-     * deadlock as shutdown invokes sendStop. The deadlock comes from sendAdd blocking on
-     * incomingQueue when the queue is full and the thread also blocking on it when
-     * trying to send the stop marker. Because this thread is actually the consumer, if it
-     * does not make progress, then we have a deadlock. 
-     * 
-     * @return int  reference counter
-     */
-    synchronized public int halt(){
-        if(!stop){
-            LOG.info("Calling halt");
-            for(LedgerHandle lh : ledgers)
-                lh.removeBookie(this);
-            refCount = 0;
-            shutdown();
-        }
-     
-        return refCount;
-    }
-    
-    /**
-     * Stop this bookie handle completely.
-     * 
-     */
-    public void shutdown(){
-        if(!stop){
-            LOG.info("Calling shutdown");
-            LOG.debug("Halting client");
-            client.halt();
-            LOG.debug("Cleaning queue");
-            sendStop();
-            LOG.debug("Finished shutdown"); 
-        }
-    }
-    
-    /**
-     * Invokes the callback method for pending requests in the queue
-     * of this BookieHandle.
-     */
-    private void cleanQueue(){
-        stop = true;
-        ToSend ts = incomingQueue.poll();
-        while(ts != null){
-            switch(ts.type){
-            case Operation.ADD:
-                SubAddOp aOp = (SubAddOp) ts.ctx;
-                aOp.wcb.writeComplete(-1, ts.lh.getId(), ts.entry, ts.ctx);
-     
-                break;
-            case Operation.READ:                
-                ((SubReadOp) ts.ctx).rcb.readEntryComplete(-1, ts.lh.getId(), ts.entry, null, ts.ctx);
-                break;
-            }
-            ts = incomingQueue.poll();
-        }
-    }
-                
-    /**
-     * Returns the negated value of stop, which gives the status of the
-     * BookieHandle.
-     */
-    
-    boolean isEnabled(){
-        return !stop;
-    }
-}
-
-    

+ 204 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/BookieWatcher.java

@@ -0,0 +1,204 @@
+package org.apache.bookkeeper.client;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.Executors;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import org.apache.bookkeeper.client.BKException.BKNotEnoughBookiesException;
+import org.apache.bookkeeper.util.SafeRunnable;
+import org.apache.bookkeeper.util.StringUtils;
+import org.apache.log4j.Logger;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.WatchedEvent;
+import org.apache.zookeeper.Watcher;
+import org.apache.zookeeper.AsyncCallback.ChildrenCallback;
+import org.apache.zookeeper.KeeperException.Code;
+
+/**
+ * This class is responsible for maintaining a consistent view of what bookies
+ * are available by listing the children of {@link #BOOKIE_REGISTRATION_PATH}
+ * in Zookeeper (and setting a watch on that listing). When a bookie fails,
+ * the other parts of the code turn to this class to find a replacement.
+ */
+class BookieWatcher implements Watcher, ChildrenCallback {
+    static final Logger logger = Logger.getLogger(BookieWatcher.class);
+
+    public static final String BOOKIE_REGISTRATION_PATH = "/ledgers/available";
+
+    // Shared "no exclusions" set. Immutable, so no caller can accidentally
+    // add to it and silently exclude bookies for everybody else.
+    static final Set<InetSocketAddress> EMPTY_SET = Collections.<InetSocketAddress> emptySet();
+
+    // Delay, in seconds, before a failed read of the bookie list is retried.
+    public static int ZK_CONNECT_BACKOFF_SEC = 1;
+
+    BookKeeper bk;
+    ScheduledExecutorService scheduler;
+
+    // Latest successfully read set of bookies. The reference is replaced
+    // wholesale, and both reads and writes happen under synchronized (this).
+    Set<InetSocketAddress> knownBookies = new HashSet<InetSocketAddress>();
+
+    // Task used to retry the read after a failure.
+    SafeRunnable reReadTask = new SafeRunnable() {
+        @Override
+        public void safeRun() {
+            readBookies();
+        }
+    };
+
+    /**
+     * @param bk
+     *            client whose Zookeeper handle is used to list the bookies
+     */
+    public BookieWatcher(BookKeeper bk) {
+        this.bk = bk;
+        this.scheduler = Executors.newSingleThreadScheduledExecutor();
+    }
+
+    /**
+     * Shuts down the scheduler used for retries. Retries requested after this
+     * call are dropped.
+     */
+    public void halt() {
+        scheduler.shutdown();
+    }
+
+    /**
+     * Asynchronously reads the list of bookies, delivering the result to this
+     * object.
+     */
+    public void readBookies() {
+        readBookies(this);
+    }
+
+    /**
+     * Asynchronously reads the list of bookies, registering this object as
+     * the watcher on the listing.
+     *
+     * @param callback
+     *            receives the result of the read
+     */
+    public void readBookies(ChildrenCallback callback) {
+        bk.getZkHandle().getChildren(BOOKIE_REGISTRATION_PATH, this, callback, null);
+    }
+
+    /**
+     * Watch callback: any event triggers a fresh read, which also sets the
+     * watch again.
+     */
+    @Override
+    public void process(WatchedEvent event) {
+        readBookies();
+    }
+
+    /**
+     * Result of a bookie listing. On failure a retry is scheduled after
+     * {@link #ZK_CONNECT_BACKOFF_SEC} seconds; on success the known set of
+     * bookies is replaced by the addresses that could be parsed.
+     */
+    @Override
+    public void processResult(int rc, String path, Object ctx, List<String> children) {
+
+        if (rc != KeeperException.Code.OK.intValue()) {
+            if (logger.isDebugEnabled()) {
+                logger.debug("Reading bookies from " + path + " failed with rc " + rc + ", retrying in "
+                        + ZK_CONNECT_BACKOFF_SEC + " second(s)");
+            }
+            scheduleReRead();
+            return;
+        }
+
+        // Read the bookie addresses into a set for efficient lookup
+        Set<InetSocketAddress> newBookieAddrs = new HashSet<InetSocketAddress>();
+        for (String bookieAddrString : children) {
+            try {
+                newBookieAddrs.add(StringUtils.parseAddr(bookieAddrString));
+            } catch (IOException e) {
+                logger.error("Could not parse bookie address: " + bookieAddrString + ", ignoring this bookie");
+            }
+        }
+
+        synchronized (this) {
+            knownBookies = newBookieAddrs;
+        }
+    }
+
+    /**
+     * Schedules a retry of the read. Once {@link #halt()} has shut the
+     * scheduler down, scheduling is rejected; that rejection must not escape
+     * into the Zookeeper callback thread, so it is logged and dropped.
+     */
+    private void scheduleReRead() {
+        try {
+            scheduler.schedule(reReadTask, ZK_CONNECT_BACKOFF_SEC, TimeUnit.SECONDS);
+        } catch (java.util.concurrent.RejectedExecutionException e) {
+            logger.info("Bookie watcher has been halted, not retrying the read of available bookies");
+        }
+    }
+
+    /**
+     * Blocks until bookies are read from zookeeper, used in the {@link BookKeeper} constructor.
+     *
+     * @throws InterruptedException
+     *             if the calling thread is interrupted while waiting
+     * @throws KeeperException
+     *             if the read completed with a return code other than OK
+     */
+    public void readBookiesBlocking() throws InterruptedException, KeeperException {
+        final LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<Integer>();
+        readBookies(new ChildrenCallback() {
+            public void processResult(int rc, String path, Object ctx, List<String> children) {
+                BookieWatcher.this.processResult(rc, path, ctx, children);
+                // The queue is unbounded, so add() always succeeds without
+                // blocking. Unlike put() it cannot be aborted by an interrupt,
+                // which would leave the waiting caller blocked forever.
+                queue.add(rc);
+            }
+        });
+        int rc = queue.take();
+
+        if (rc != KeeperException.Code.OK.intValue()) {
+            throw KeeperException.create(Code.get(rc));
+        }
+    }
+
+    /**
+     * Wrapper over the {@link #getAdditionalBookies(Set, int)} method when there is no exclusion list (or existing bookies)
+     *
+     * @param numBookiesNeeded
+     *            how many bookies to return
+     * @return the chosen bookies
+     * @throws BKNotEnoughBookiesException
+     *             if fewer than numBookiesNeeded bookies are known
+     */
+    public ArrayList<InetSocketAddress> getNewBookies(int numBookiesNeeded) throws BKNotEnoughBookiesException {
+        return getAdditionalBookies(EMPTY_SET, numBookiesNeeded);
+    }
+
+    /**
+     * Wrapper over the {@link #getAdditionalBookies(Set, int)} method when you just need 1 extra bookie
+     *
+     * @param existingBookies
+     *            bookies that must not be returned
+     * @return a known bookie that is not in existingBookies
+     * @throws BKNotEnoughBookiesException
+     *             if every known bookie is already in existingBookies
+     */
+    public InetSocketAddress getAdditionalBookie(List<InetSocketAddress> existingBookies)
+            throws BKNotEnoughBookiesException {
+        return getAdditionalBookies(new HashSet<InetSocketAddress>(existingBookies), 1).get(0);
+    }
+
+    /**
+     * Returns additional bookies given an exclusion list and how many are needed
+     *
+     * @param existingBookies
+     *            bookies that must not be returned
+     * @param numAdditionalBookiesNeeded
+     *            how many bookies to return; zero or less yields an empty list
+     * @return randomly chosen known bookies, none of them in existingBookies
+     * @throws BKNotEnoughBookiesException
+     *             if not enough eligible bookies are known
+     */
+    public ArrayList<InetSocketAddress> getAdditionalBookies(Set<InetSocketAddress> existingBookies,
+            int numAdditionalBookiesNeeded) throws BKNotEnoughBookiesException {
+
+        ArrayList<InetSocketAddress> newBookies = new ArrayList<InetSocketAddress>();
+
+        if (numAdditionalBookiesNeeded <= 0) {
+            return newBookies;
+        }
+
+        List<InetSocketAddress> allBookies;
+
+        // Work on a private copy so the shuffle below does not need the lock
+        synchronized (this) {
+            allBookies = new ArrayList<InetSocketAddress>(knownBookies);
+        }
+
+        // Random order spreads the load over the available bookies
+        Collections.shuffle(allBookies);
+
+        for (InetSocketAddress bookie : allBookies) {
+            if (existingBookies.contains(bookie)) {
+                continue;
+            }
+
+            newBookies.add(bookie);
+            numAdditionalBookiesNeeded--;
+
+            if (numAdditionalBookiesNeeded == 0) {
+                return newBookies;
+            }
+        }
+
+        throw new BKNotEnoughBookiesException();
+    }
+
+}

+ 50 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/CRC32DigestManager.java

@@ -0,0 +1,50 @@
+package org.apache.bookkeeper.client;
+
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+
+import java.nio.ByteBuffer;
+import java.util.zip.CRC32;
+
+/**
+ * {@link DigestManager} whose digest is the CRC32 checksum of the bytes fed to
+ * it, written out as an 8-byte long.
+ */
+class CRC32DigestManager extends DigestManager {
+    // Size of the emitted digest: one long
+    private static final int DIGEST_LENGTH = 8;
+
+    CRC32 crc = new CRC32();
+
+    public CRC32DigestManager(long ledgerId) {
+        super(ledgerId);
+    }
+
+    /** Feeds {@code length} bytes of {@code data}, starting at {@code offset}, into the running checksum. */
+    @Override
+    void update(byte[] data, int offset, int length) {
+        crc.update(data, offset, length);
+    }
+
+    /** @return number of bytes produced by {@link #getValueAndReset()} */
+    @Override
+    int getMacCodeLength() {
+        return DIGEST_LENGTH;
+    }
+
+    /**
+     * @return the checksum accumulated so far as 8 bytes; the running
+     *         checksum is cleared so the next digest starts fresh
+     */
+    @Override
+    byte[] getValueAndReset() {
+        long checksum = crc.getValue();
+        crc.reset();
+        return ByteBuffer.allocate(DIGEST_LENGTH).putLong(checksum).array();
+    }
+}

+ 0 - 138
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/ClientCBWorker.java

@@ -1,138 +0,0 @@
-package org.apache.bookkeeper.client;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.bookkeeper.client.QuorumEngine.Operation;
-import org.apache.bookkeeper.client.QuorumEngine.Operation.AddOp;
-import org.apache.bookkeeper.client.QuorumEngine.Operation.ReadOp;
-import org.apache.log4j.Logger;
-
-/**
- * Thread responsible for delivering results to clients. This thread
- * basically isolates the application from the remainder of the
- * BookKeeper client. 
- * 
- */
-
-class ClientCBWorker extends Thread{
-    static Logger LOG = Logger.getLogger(ClientCBWorker.class);
-    static ClientCBWorker instance = null;
-    
-    private volatile boolean stop;
-    private static int instanceCounter= 0;
-    
-    ArrayBlockingQueue<Operation> pendingOps;
-    QuorumOpMonitor monitor;
-    
-    
-    static ClientCBWorker getInstance(){
-        if(instance == null){
-            instance = new ClientCBWorker();
-        }
-        instanceCounter++;
-        
-        return instance;
-    }
-    
-    /**
-     * Constructor initiates queue of pending operations and
-     * runs thread.
-     * 
-     */
-    ClientCBWorker(){
-       pendingOps = new ArrayBlockingQueue<Operation>(6000);  
-       stop = false;
-       start();
-       LOG.info("Have started cbWorker");
-    }
-    
-    
-    /**
-     * Adds operation to queue of pending.
-     * 
-     * @param   op  operation to add to queue
-     */
-    
-    void addOperation(Operation op) 
-    throws InterruptedException {
-        pendingOps.put(op);
-    }
-    
-    /**
-     * Gets thread out of its main loop.
-     * 
-     */
-    void shutdown(){
-        if((--instanceCounter) == 0){
-            stop = true;
-            instance = null;
-            LOG.info("Shutting down CBWorker");
-        }
-    }
-    
-    
-    /**
-     * Main thread loop.
-     * 
-     */
-    
-    public void run(){
-        try{
-            while(!stop){
-                Operation op = pendingOps.poll(1000, TimeUnit.MILLISECONDS);
-                if(op != null){
-                    synchronized(op){
-                        while(!op.isReady()){
-                            op.wait(1000);
-                        }
-                    }
-                    
-                    switch(op.type){
-                    case Operation.ADD:
-                        AddOp aOp = (AddOp) op;
-                       
-                        aOp.getLedger().setAddConfirmed(aOp.entry);
-                        aOp.cb.addComplete(aOp.getErrorCode(),
-                                aOp.getLedger(),
-                                aOp.entry, 
-                                aOp.ctx);
-                        
-                        break;
-                    case Operation.READ:
-                        ReadOp rOp = (ReadOp) op;
-                        rOp.cb.readComplete(rOp.getErrorCode(), 
-                                rOp.getLedger(),
-                                new LedgerSequence(rOp.seq), 
-                                rOp.ctx);
-                        break;
-                    }
-                } 
-            }
-        } catch (InterruptedException e){
-           LOG.error("Exception while waiting on queue or operation"); 
-        }
-    }
-}

+ 162 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/DigestManager.java

@@ -0,0 +1,162 @@
+package org.apache.bookkeeper.client;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.nio.ByteBuffer;
+import java.security.GeneralSecurityException;
+
+import org.apache.bookkeeper.client.BKException.BKDigestMatchException;
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
+import org.apache.log4j.Logger;
+import org.jboss.netty.buffer.ChannelBuffer;
+import org.jboss.netty.buffer.ChannelBufferInputStream;
+import org.jboss.netty.buffer.ChannelBuffers;
+
+/**
+ * This class takes an entry, attaches a digest to it and packages it with relevant
+ * data so that it can be shipped to the bookie. On the return side, it also
+ * gets a packet, checks that the digest matches, and extracts the original entry
+ * for the packet. Currently 2 types of digests are supported: MAC (based on SHA-1) and CRC32
+ *
+ * Packet layout, as produced by
+ * {@link #computeDigestAndPackageForSending(long, long, byte[])}: ledger id
+ * (8 bytes), entry id (8 bytes), last add confirmed (8 bytes), digest
+ * (macCodeLength bytes), entry data. The digest covers the three longs and
+ * the entry data.
+ */
+
+abstract class DigestManager {
+    static final Logger logger = Logger.getLogger(DigestManager.class);
+
+    // Bytes taken by the ledger id, entry id and last add confirmed that
+    // precede the digest in every packet.
+    private static final int METADATA_LENGTH = 24;
+
+    long ledgerId;
+
+    final int macCodeLength;
+
+    /** @return number of bytes in the digest produced by {@link #getValueAndReset()} */
+    abstract int getMacCodeLength();
+
+    /** Feeds {@code length} bytes of {@code data}, starting at {@code offset}, into the running digest. */
+    abstract void update(byte[] data, int offset, int length);
+
+    /** @return the digest of everything fed in so far; the running digest is reset */
+    abstract byte[] getValueAndReset();
+
+    /** Feeds the whole array into the running digest. */
+    void update(byte[] data) {
+        update(data, 0, data.length);
+    }
+
+    public DigestManager(long ledgerId) {
+        this.ledgerId = ledgerId;
+        // Invoked before the subclass constructor body has run, so
+        // implementations must not depend on their own instance state here.
+        macCodeLength = getMacCodeLength();
+    }
+
+    /**
+     * Creates the digest manager matching the given digest type.
+     *
+     * @throws GeneralSecurityException
+     *             if the digest type is not known (the declaration also lets
+     *             the MAC implementation report failures of its own)
+     */
+    static DigestManager instantiate(long ledgerId, byte[] passwd, DigestType digestType) throws GeneralSecurityException {
+        switch (digestType) {
+        case MAC:
+            return new MacDigestManager(ledgerId, passwd);
+        case CRC32:
+            return new CRC32DigestManager(ledgerId);
+        default:
+            throw new GeneralSecurityException("Unknown checksum type: " + digestType);
+        }
+    }
+
+    /**
+     * Builds the packet for an entry: header (ids, last add confirmed,
+     * digest) followed by the entry data. The data array is wrapped, not
+     * copied.
+     */
+    ChannelBuffer computeDigestAndPackageForSending(long entryId, long lastAddConfirmed, byte[] data) {
+
+        byte[] headerArray = new byte[METADATA_LENGTH + macCodeLength];
+        ByteBuffer header = ByteBuffer.wrap(headerArray);
+        header.putLong(ledgerId);
+        header.putLong(entryId);
+        header.putLong(lastAddConfirmed);
+
+        update(headerArray, 0, METADATA_LENGTH);
+        update(data);
+        byte[] digest = getValueAndReset();
+
+        // The buffer position is right after the three longs, where the digest goes
+        header.put(digest);
+        header.flip();
+
+        return ChannelBuffers.wrappedBuffer(ChannelBuffers.wrappedBuffer(header), ChannelBuffers.wrappedBuffer(data));
+    }
+
+    /** Verifies digest and ledger id only; the entry id is not checked. */
+    private void verifyDigest(ChannelBuffer dataReceived) throws BKDigestMatchException {
+        verifyDigest(-1, dataReceived, true);
+    }
+
+    /** Verifies digest, ledger id and entry id. */
+    private void verifyDigest(long entryId, ChannelBuffer dataReceived) throws BKDigestMatchException {
+        verifyDigest(entryId, dataReceived, false);
+    }
+
+    /**
+     * Recomputes the digest over the readable bytes of the packet and checks
+     * it, together with the ledger id and (optionally) the entry id, against
+     * what the packet carries. The reader index of the packet is left
+     * untouched.
+     *
+     * @throws BKDigestMatchException
+     *             if the packet is too short, the digest differs, or an id
+     *             does not match
+     */
+    private void verifyDigest(long entryId, ChannelBuffer dataReceived, boolean skipEntryIdCheck)
+            throws BKDigestMatchException {
+
+        final int start = dataReceived.readerIndex();
+        final int readable = dataReceived.readableBytes();
+        final int headerLength = METADATA_LENGTH + macCodeLength;
+
+        // A truncated packet cannot hold a digest; reject it before touching
+        // the running digest state.
+        if (readable < headerLength) {
+            logger.error("Packet too short to carry a digest for ledger-id: " + ledgerId + ", entry-id: " + entryId
+                    + ", length: " + readable);
+            throw new BKDigestMatchException();
+        }
+
+        // Get at the bytes without copying when the packet is array backed,
+        // honouring the array offset; otherwise fall back to a copy.
+        byte[] bytes;
+        int base;
+        ByteBuffer nioView = dataReceived.toByteBuffer();
+        if (nioView.hasArray()) {
+            bytes = nioView.array();
+            base = nioView.arrayOffset() + nioView.position();
+        } else {
+            bytes = new byte[readable];
+            dataReceived.getBytes(start, bytes);
+            base = 0;
+        }
+
+        update(bytes, base, METADATA_LENGTH);
+        update(bytes, base + headerLength, readable - headerLength);
+        byte[] digest = getValueAndReset();
+
+        // Compare every byte regardless of earlier differences, so the time
+        // taken does not reveal how much of a forged digest was correct.
+        int mismatch = 0;
+        for (int i = 0; i < digest.length; i++) {
+            mismatch |= digest[i] ^ bytes[base + METADATA_LENGTH + i];
+        }
+        if (mismatch != 0) {
+            logger.error("Mac mismatch for ledger-id: " + ledgerId + ", entry-id: " + entryId);
+            throw new BKDigestMatchException();
+        }
+
+        long actualLedgerId = dataReceived.getLong(start);
+        long actualEntryId = dataReceived.getLong(start + 8);
+
+        if (actualLedgerId != ledgerId) {
+            logger.error("Ledger-id mismatch in authenticated message, expected: " + ledgerId + " , actual: "
+                    + actualLedgerId);
+            throw new BKDigestMatchException();
+        }
+
+        if (!skipEntryIdCheck && actualEntryId != entryId) {
+            logger.error("Entry-id mismatch in authenticated message, expected: " + entryId + " , actual: "
+                    + actualEntryId);
+            throw new BKDigestMatchException();
+        }
+
+    }
+
+    /**
+     * Verifies the packet for the given entry and returns a stream over the
+     * entry data that follows the header.
+     */
+    ChannelBufferInputStream verifyDigestAndReturnData(long entryId, ChannelBuffer dataReceived)
+            throws BKDigestMatchException {
+        int start = dataReceived.readerIndex();
+        verifyDigest(entryId, dataReceived);
+        // Skip the header so the stream starts at the entry data
+        dataReceived.readerIndex(start + METADATA_LENGTH + macCodeLength);
+        return new ChannelBufferInputStream(dataReceived);
+    }
+
+    /** Entry id and last add confirmed extracted from a verified packet. */
+    static class RecoveryData {
+        long lastAddConfirmed;
+        long entryId;
+
+        public RecoveryData(long lastAddConfirmed, long entryId) {
+            this.lastAddConfirmed = lastAddConfirmed;
+            this.entryId = entryId;
+        }
+
+    }
+
+    /**
+     * Verifies the packet (without checking its entry id) and returns the
+     * entry id and last add confirmed it carries.
+     */
+    RecoveryData verifyDigestAndReturnLastConfirmed(ChannelBuffer dataReceived) throws BKDigestMatchException {
+        int start = dataReceived.readerIndex();
+        verifyDigest(dataReceived);
+        // Skip the ledger id; entry id and last add confirmed follow
+        dataReceived.readerIndex(start + 8);
+
+        long entryId = dataReceived.readLong();
+        long lastAddConfirmed = dataReceived.readLong();
+        return new RecoveryData(lastAddConfirmed, entryId);
+
+    }
+}

+ 61 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/DistributionSchedule.java

@@ -0,0 +1,61 @@
+package org.apache.bookkeeper.client;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This interface determines how entries are distributed among bookies.
+ * 
+ * Every entry gets replicated to some number of replicas. The first replica for
+ * an entry is given a replicaIndex of 0, and so on. To distribute write load,
+ * not all entries go to all bookies. Given an entry-id and replica index, an
+ * {@link DistributionSchedule} determines which bookie that replica should go
+ * to.
+ */
+
+interface DistributionSchedule {
+
+    /**
+     * 
+     * @param entryId
+     * @param replicaIndex
+     * @return index of bookie that should get this replica
+     */
+    public int getBookieIndex(long entryId, int replicaIndex);
+
+    /**
+     * 
+     * @param entryId
+     * @param bookieIndex
+     * @return -1 if the given bookie index is not a replica for the given
+     *         entryId
+     */
+    public int getReplicaIndex(long entryId, int bookieIndex);
+
+    /**
+     * Specifies whether it's ok to proceed with recovery given that we have
+     * heard back from the given bookie index. These calls will be made in
+     * sequence, and an implementation of this interface should accumulate
+     * history about which bookie indexes we have heard from. Once this method
+     * has returned true, it won't be called again on the same instance.
+     * 
+     * @param bookieIndexHeardFrom
+     * @return true if it is ok to proceed with recovery
+     */
+    public boolean canProceedWithRecovery(int bookieIndexHeardFrom);
+}

+ 163 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerCreateOp.java

@@ -0,0 +1,163 @@
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+package org.apache.bookkeeper.client;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.security.GeneralSecurityException;
+import java.util.ArrayList;
+import org.apache.bookkeeper.client.AsyncCallback.CreateCallback;
+import org.apache.bookkeeper.client.BKException.BKNotEnoughBookiesException;
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
+import org.apache.bookkeeper.util.StringUtils;
+import org.apache.log4j.Logger;
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.AsyncCallback.StatCallback;
+import org.apache.zookeeper.AsyncCallback.StringCallback;
+import org.apache.zookeeper.ZooDefs.Ids;
+import org.apache.zookeeper.data.Stat;
+
+/**
+ * Encapsulates asynchronous ledger create operation
+ * 
+ */
+class LedgerCreateOp implements StringCallback, StatCallback {
+
+    static final Logger LOG = Logger.getLogger(LedgerCreateOp.class);
+
+    CreateCallback cb;
+    LedgerMetadata metadata;
+    LedgerHandle lh;
+    Object ctx;
+    byte[] passwd;
+    BookKeeper bk;
+    DigestType digestType;
+
+   /**
+    * Constructor
+    * 
+    * @param bk
+    *       BookKeeper object
+    * @param ensembleSize
+    *       ensemble size
+    * @param quorumSize
+    *       quorum size
+    * @param digestType
+    *       digest type, either MAC or CRC32
+    * @param passwd
+    *       password
+    * @param cb
+    *       callback implementation
+    * @param ctx
+    *       optional control object
+    */
+
+    LedgerCreateOp(BookKeeper bk, int ensembleSize, int quorumSize, DigestType digestType, byte[] passwd, CreateCallback cb, Object ctx) {
+        this.bk = bk;
+        this.metadata = new LedgerMetadata(ensembleSize, quorumSize);
+        this.digestType = digestType;
+        this.passwd = passwd;
+        this.cb = cb;
+        this.ctx = ctx;
+    }
+
+    /**
+     * Initiates the operation
+     */
+    public void initiate() {
+        /*
+         * Create ledger node on ZK. We get the id from the sequence number on
+         * the node.
+         */
+
+        bk.getZkHandle().create(StringUtils.prefix, new byte[0], Ids.OPEN_ACL_UNSAFE,
+                CreateMode.PERSISTENT_SEQUENTIAL, this, null);
+
+        // calls the string callback method below
+    }
+
+
+    /**
+     * Implements ZooKeeper string callback.
+     * 
+     * @see org.apache.zookeeper.AsyncCallback.StringCallback#processResult(int, java.lang.String, java.lang.Object, java.lang.String)
+     */
+    public void processResult(int rc, String path, Object ctx, String name) {
+
+        if (rc != KeeperException.Code.OK.intValue()) {
+            LOG.error("Could not create node for ledger", KeeperException.create(KeeperException.Code.get(rc), path));
+            cb.createComplete(BKException.Code.ZKException, null, this.ctx);
+            return;
+        }
+
+        /*
+         * Extract ledger id.
+         */
+        long ledgerId;
+        try {
+            ledgerId = StringUtils.getLedgerId(name);
+        } catch (IOException e) {
+            LOG.error("Could not extract ledger-id from path:" + path, e);
+            cb.createComplete(BKException.Code.ZKException, null, this.ctx);
+            return;
+        }
+
+        /*
+         * Adding bookies to ledger handle
+         */
+
+        ArrayList<InetSocketAddress> ensemble;
+        try {
+            ensemble = bk.bookieWatcher.getNewBookies(metadata.ensembleSize);
+        } catch (BKNotEnoughBookiesException e) {
+            LOG.error("Not enough bookies to create ledger" + ledgerId);
+            cb.createComplete(e.getCode(), null, this.ctx);
+            return;
+        }
+
+        /*
+         * Add ensemble to the configuration
+         */
+        metadata.addEnsemble(new Long(0), ensemble);
+        try {
+            lh = new LedgerHandle(bk, ledgerId, metadata, digestType, passwd);
+        } catch (GeneralSecurityException e) {
+            LOG.error("Security exception while creating ledger: " + ledgerId, e);
+            cb.createComplete(BKException.Code.DigestNotInitializedException, null, this.ctx);
+            return;
+        }
+
+        lh.writeLedgerConfig(this, null);
+
+    }
+
+    /**
+     * Implements ZooKeeper stat callback.
+     * 
+     * @see org.apache.zookeeper.AsyncCallback.StatCallback#processResult(int, String, Object, Stat)
+     */
+    public void processResult(int rc, String path, Object ctx, Stat stat) {
+        cb.createComplete(rc, lh, this.ctx);
+    }
+
+}

+ 46 - 26
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerEntry.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.client;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,39 +21,58 @@ package org.apache.bookkeeper.client;
  * 
  */
 
-
+import java.io.IOException;
+import java.io.InputStream;
 
 import org.apache.log4j.Logger;
+import org.jboss.netty.buffer.ChannelBufferInputStream;
 
 /**
- * Ledger entry. Currently only holds the necessary
- * fields to identify a ledger entry, and the entry
- * content.
+ * Ledger entry. It's a simple tuple containing the ledger id, the entry-id, and
+ * the entry content.
  * 
  */
 
 public class LedgerEntry {
-    Logger LOG = Logger.getLogger(LedgerEntry.class);
-    
-    private long lId;
-    private long eId;
-    private byte[] entry;
-    
-    LedgerEntry(long lId, long eId, byte[] entry){
-        this.lId = lId;
-        this.eId = eId;
-        this.entry = entry;
-    }
-    
-    public long getLedgerId(){
-        return lId;
-    }
-    
-    public long getEntryId(){
-        return eId;
-    }
-    
-    public byte[] getEntry(){
-        return entry;
+  Logger LOG = Logger.getLogger(LedgerEntry.class);
+
+  long ledgerId;
+  long entryId;
+  ChannelBufferInputStream entryDataStream;
+
+  int nextReplicaIndexToReadFrom = 0;
+
+  LedgerEntry(long lId, long eId) {
+    this.ledgerId = lId;
+    this.entryId = eId;
+  }
+
+  public long getLedgerId() {
+    return ledgerId;
+  }
+
+  public long getEntryId() {
+    return entryId;
+  }
+
+  public byte[] getEntry() {
+    try {
+      // In general, you can't rely on the available() method of an input
+      // stream, but ChannelBufferInputStream is backed by a byte[] so it
+      // accurately knows the # bytes available
+      byte[] ret = new byte[entryDataStream.available()];
+      entryDataStream.readFully(ret);
+      return ret;
+    } catch (IOException e) {
+      // ChannelBufferInputStream doesn't really throw IOExceptions; they
+      // only appear in the signature because InputStream declares them.
+      // Hence this code should never be reached.
+      LOG.fatal("Unexpected IOException while reading from channel buffer", e);
+      return new byte[0];
     }
+  }
+
+  public InputStream getEntryInputStream() {
+    return entryDataStream;
+  }
 }

+ 399 - 795
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerHandle.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.client;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,825 +21,428 @@ package org.apache.bookkeeper.client;
  * 
  */
 
-
-import java.io.IOException;
 import java.net.InetSocketAddress;
-import java.net.ConnectException;
-import java.nio.ByteBuffer;
-import java.security.NoSuchAlgorithmException;
-import java.security.MessageDigest;
+import java.security.GeneralSecurityException;
+import java.util.ArrayDeque;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.TreeMap;
-
-import org.apache.bookkeeper.client.BKDefs;
+import java.util.Enumeration;
+import java.util.Queue;
 import org.apache.bookkeeper.client.BKException;
-import org.apache.bookkeeper.client.BookieHandle;
 import org.apache.bookkeeper.client.AsyncCallback.AddCallback;
 import org.apache.bookkeeper.client.AsyncCallback.CloseCallback;
 import org.apache.bookkeeper.client.AsyncCallback.ReadCallback;
-import org.apache.bookkeeper.client.BKException.Code;
-import org.apache.bookkeeper.client.LedgerManagementProcessor.CloseLedgerOp;
-import org.apache.bookkeeper.client.QuorumEngine.Operation;
-import org.apache.bookkeeper.client.QuorumEngine.Operation.AddOp;
-import org.apache.bookkeeper.client.QuorumEngine.Operation.ReadOp;
-import org.apache.bookkeeper.client.QuorumEngine.Operation.StopOp;
+import org.apache.bookkeeper.client.BKException.BKNotEnoughBookiesException;
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
+import org.apache.bookkeeper.client.LedgerMetadata;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.GenericCallback;
+import org.apache.bookkeeper.util.SafeRunnable;
+import org.apache.bookkeeper.util.StringUtils;
+
 import org.apache.log4j.Logger;
-import org.apache.zookeeper.CreateMode;
+
 import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.ZooDefs.Ids;
+import org.apache.zookeeper.AsyncCallback.StatCallback;
+import org.apache.zookeeper.data.Stat;
+import org.jboss.netty.buffer.ChannelBuffer;
 
 /**
- * Ledger handle on the client side. Contains ledger metadata
- * used to access it. This api exposes the read and write 
- * to a ledger and also exposes a streaming api for the ledger.
+ * Ledger handle contains ledger metadata and is used to access the read and
+ * write operations to a ledger.
  */
-public class LedgerHandle implements ReadCallback, AddCallback {
-    /**
-     * the call stack looks like --
-     * ledgerhandle->write->bookeeper->quorumengine->bookiehandle
-     * ->bookieclient
-     */
-   static Logger LOG = Logger.getLogger(LedgerHandle.class);
-    
-    public enum QMode {VERIFIABLE, GENERIC, FREEFORM};
-    
-    
-    private long ledger;
-    private volatile long last;
-    private volatile long lastAddConfirmed = 0;
-    private HashMap<Integer, Long> lastRecvCorrectly;
-    private volatile ArrayList<BookieHandle> bookies;
-    private ArrayList<InetSocketAddress> bookieAddrList;
-    private TreeMap<Long, ArrayList<BookieHandle> > bookieConfigMap;
-    private long[] entryChange;
-    private BookKeeper bk;
-    private QuorumEngine qe;
-    private int qSize;
-    private QMode qMode = QMode.VERIFIABLE;
-    private int lMode;
-
-    private int threshold;
-    private String digestAlg = "SHA1";
-    
-    private byte[] macKey;
-    private byte[] ledgerKey;
-    private byte[] passwd;
-    
-    /**
-     * @param bk the bookkeeper handle
-     * @param ledger the id for this ledger
-     * @param last the last id written 
-     * @param passwd the passwd to encode
-     * the entries
-     * @throws InterruptedException
-     */
-    LedgerHandle(BookKeeper bk, 
-            long ledger, 
-            long last,
-            byte[] passwd) throws InterruptedException {
-        this.bk = bk;
-        this.ledger = ledger;
-        this.last = last;
-        this.bookies = new ArrayList<BookieHandle>();
-        this.lastRecvCorrectly = new HashMap<Integer, Long>();
-        this.passwd = passwd;
-        genLedgerKey(passwd);
-        genMacKey(passwd);
-        this.qSize = (bookies.size() + 1)/2;
-        this.qe = new QuorumEngine(this);
-    }
-    
-    /**
-     * @param bk the bookkeeper handle
-     * @param ledger the id for this ledger
-     * @param last the last entree written
-     * @param qSize the queuing size 
-     * for this ledger
-     * @param mode the quueuing mode
-     * for this ledger
-     * @param passwd the passwd to encode
-     * @throws InterruptedException
-     */
-    LedgerHandle(BookKeeper bk, 
-            long ledger, 
-            long last,
-            int qSize, 
-            QMode mode,
-            byte[] passwd) throws InterruptedException {
-        this.bk = bk;
-        this.ledger = ledger;
-        this.last = last;
-        this.bookies = new ArrayList<BookieHandle>();
-        this.lastRecvCorrectly = new HashMap<Integer, Long>();
-
-
-        this.qSize = qSize;
-        this.qMode = mode;
-        this.passwd = passwd;
-        genLedgerKey(passwd);
-        genMacKey(passwd);
-        this.qe = new QuorumEngine(this);
-    }
-        
-    /**
-     * 
-     * @param bk the bookkeeper handle
-     * @param ledger the id for this ledger
-     * @param last the last entree written
-     * @param qSize the queuing size 
-     * for this ledger
-     * @param passwd the passwd to encode
-     * @throws InterruptedException
-     */
-    LedgerHandle(BookKeeper bk, 
-            long ledger, 
-            long last,
-            int qSize,
-            byte[] passwd) throws InterruptedException {
-        this.bk = bk;
-        this.ledger = ledger;
-        this.last = last;
-        this.bookies = new ArrayList<BookieHandle>();
-        this.lastRecvCorrectly = new HashMap<Integer, Long>();
-
-
-        this.qSize = qSize;
-        this.passwd = passwd;
-        genLedgerKey(passwd);
-        genMacKey(passwd);
-        this.qe = new QuorumEngine(this);
-    }
-    
-    private void setBookies(ArrayList<InetSocketAddress> bookies)
-    throws InterruptedException {
-    	try{
-    		for(InetSocketAddress a : bookies){
-    			LOG.debug("Opening bookieHandle: " + a);
-            
-    			//BookieHandle bh = new BookieHandle(this, a);
-    			this.bookies.add(bk.getBookieHandle(this, a));
-    		}
-    	} catch(ConnectException e){
-    		LOG.error(e);
-    		InetSocketAddress addr = bk.getNewBookie(bookies);
-    		if(addr != null){
-    		    bookies.add(addr);
-    		}
-    	} catch(IOException e) {
-    		LOG.error(e);
-    	}
-    }
-    
-    /**
-     * set the quorum engine
-     * @param qe the quorum engine
-     */
-    void setQuorumEngine(QuorumEngine qe) {
-        this.qe = qe;
-    }
-    
-    /** get the quorum engine
-     * @return return the quorum engine
-     */
-    QuorumEngine getQuorumEngine() {
-        return this.qe;
-    }
-    
-    /**
-     * Create bookie handle and add it to the list
-     * 
-     * @param addr	socket address
-     */
-    int addBookieForWriting(InetSocketAddress addr)
-    throws IOException {
-        LOG.debug("Bookie address: " + addr);
-        lMode = BKDefs.WRITE;
-        //BookieHandle bh = new BookieHandle(this, addr);
-        this.bookies.add(bk.getBookieHandle(this, addr));
-        if(bookies.size() > qSize) setThreshold();
-        return (this.bookies.size() - 1);
-    }
-    
-    /**
-     * Create bookie handle and add it to the list
-     * 
-     * @param addr  socket address
-     */
-    int addBookieForReading(InetSocketAddress addr)
-    throws IOException {
-        LOG.debug("Bookie address: " + addr);
-        lMode = BKDefs.READ;
-        //BookieHandle bh = new BookieHandle(this, addr);
-        try{
-            this.bookies.add(bk.getBookieHandle(this, addr));
-        } catch (IOException e){
-            LOG.info("Inserting a decoy bookie handle");
-            this.bookies.add(new BookieHandle(addr, false));
-        }
-        if(bookies.size() > qSize) setThreshold();
-        return (this.bookies.size() - 1);
-    }
+public class LedgerHandle implements ReadCallback, AddCallback, CloseCallback {
+  final static Logger LOG = Logger.getLogger(LedgerHandle.class);
 
-    
-    private void setThreshold() {
-        switch(qMode){
-        case GENERIC:
-            threshold = bookies.size() - qSize/2;
-            break;
-        case VERIFIABLE:
-            threshold = bookies.size() - qSize + 1;
-            break;
-        default:
-            threshold = bookies.size();
-        }
-        
-    }
-    
-    public int getThreshold() {
-        return threshold;
-    }
-    
-    
-    /**
-     * Writes to BookKeeper changes to the ensemble.
-     *         
-     * @param addr  Address of faulty bookie
-     * @param entry Last entry written before change of ensemble.
-     */
-    
-    void changeEnsemble(long entry){
-        String path = BKDefs.prefix + 
-        bk.getZKStringId(getId()) +  
-        BKDefs.quorumEvolution + "/" + 
-        String.format("%010d", entry);
-        
-        LOG.info("Report failure: " + String.format("%010d", entry));
-        try{
-            if(bk.getZooKeeper().exists(BKDefs.prefix + 
-                    bk.getZKStringId(getId()) +  
-                    BKDefs.quorumEvolution, false) == null)
-                bk.getZooKeeper().create(BKDefs.prefix + bk.getZKStringId(getId()) + 
-                        BKDefs.quorumEvolution, new byte[0], Ids.OPEN_ACL_UNSAFE, 
-                        CreateMode.PERSISTENT);
-        
-            boolean first = true;
-            String addresses = "";
-            for(BookieHandle bh : bookies){
-                if(first){ 
-                    addresses = bh.addr.toString();
-                    first = false;
-                }
-                else 
-                    addresses = addresses + " " + bh.addr.toString();
-            }
-            
-            bk.getZooKeeper() .create(path, addresses.getBytes(),
-                    Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-        } catch(Exception e){
-            LOG.error("Could not write to ZooKeeper: " + path + ", " + e);
-        }
-    }
-    
-    /**
-     * Replace bookie in the case of a failure 
-     */
-    void replaceBookie(int index) 
-    throws BKException {
-        InetSocketAddress addr = null;
-        try{
-            addr = bk.getNewBookie(bookieAddrList);
-        } catch(InterruptedException e){
-            LOG.error(e);
-        }
-        
-        if(addr == null){
-            throw BKException.create(Code.NoBookieAvailableException);
-        } else {           
-            try{
-                //BookieHandle bh = new BookieHandle(this, addr);
-                
-                /*
-                 * TODO: Read from current bookies, and write to this one
-                 */
-                
-                /*
-                 * If successful in writing to new bookie, add it to the set
-                 */
-                this.bookies.set(index, bk.getBookieHandle(this, addr));
-            } catch(ConnectException e){
-                bk.blackListBookie(addr);
-                LOG.error(e);
-            } catch(IOException e) {
-                bk.blackListBookie(addr);
-                LOG.error(e);
-            }
-        }
-    }
-    
-    /**
-     * This method is used when BK cannot find a bookie
-     * to replace the current faulty one. In such cases,
-     * we simply remove the bookie.
-     * 
-     * 
-     * @param BookieHandle
-     */
-    synchronized void removeBookie(BookieHandle bh){
-       if(lMode == BKDefs.WRITE){
-           LOG.info("Removing bookie: " + bh.addr);
-           int index = bookies.indexOf(bh);
-           if(index >= 0){
-               Long tmpLastRecv = lastRecvCorrectly.get(index);
-               bookies.remove(index);
-        
-               if(tmpLastRecv == null)
-                   changeEnsemble(0);
-               else
-                   changeEnsemble(tmpLastRecv);
-           }
-       }
-    }
-    
-    
-    /**
-     * Returns the ledger identifier
-     * @return long
-     */
-    public long getId(){
-        return ledger;
-    }
-    
-    /**
-     * Returns the last entry identifier submitted
-     * @return long
-     */
-    public long getLast(){
-        return last;   
-    }
-    
-    /**
-     * Returns the last entry identifier submitted and increments it.
-     * @return long
-     */
-    long incLast(){
-        return last++;
-    }
-    
-    /**
-     * Sets the last entry identifier submitted.
-     * 
-     * @param   last    last entry
-     * @return  long    returns the value just set
-     */
-    long setLast(long last){
-        this.last = last;
-        return this.last;
-    }
-    
-    /**
-     * Sets the value of the last add confirmed. This is used
-     * when adding new entries, since we use this value as a hint
-     * to recover from failures of the client.
-     */
-    void setAddConfirmed(long entryId){
-        if(entryId > lastAddConfirmed)
-            lastAddConfirmed = entryId;
-    }
-    
-    long getAddConfirmed(){
-        return lastAddConfirmed;
-    }
-    
-    void setLastRecvCorrectly(int sId, long entry){
-        //LOG.info("Setting last received correctly: " + entry);
-        lastRecvCorrectly.put(sId, entry);
-    }
-    
-    /**
-     * Returns the list of bookies
-     * @return ArrayList<BookieHandle>
-     */
-    ArrayList<BookieHandle> getBookies(){
-        return bookies;
-    }
-    
-    /**
-     * For reads, there might be multiple operations.
-     * 
-     * @param entry
-     * @return ArrayList<BookieHandle>  returns list of bookies
-     */
-    ArrayList<BookieHandle> getBookies(long entry){
-        return getConfig(entry);
-    }
-    
-    /**
-     * Returns the bookie handle corresponding to the addresses in the input.
-     * 
-     * @param addr
-     * @return
-     */
-    BookieHandle getBookieHandleDup(InetSocketAddress addr){
-        for(BookieHandle bh : bookies){
-            if(bh.addr.equals(addr))
-                return bh;
-        }
-        
-        return null;
-    }
-    
-    /**
-     * Sets a new bookie configuration corresponding to a failure during
-     * writes to the ledger. We have one configuration for every failure.
-     * 
-     * @param entry
-     * @param list
-     */
-    
-    void setNewBookieConfig(long entry, ArrayList<BookieHandle> list){
-        if(bookieConfigMap == null)
-            bookieConfigMap = new TreeMap<Long, ArrayList<BookieHandle> >();
-        
-        /*
-         * If initial config is not in the list, we include it.
-         */
-        if(!bookieConfigMap.containsKey(new Long(0))){
-            bookieConfigMap.put(new Long(0), bookies);
-        }
-        
-        LOG.info("Adding new entry: " + entry + ", " + bookies.size() + ", " + list.size());
-        bookieConfigMap.put(entry, list);
+  final byte[] ledgerKey;
+  final LedgerMetadata metadata;
+  final BookKeeper bk;
+  final long ledgerId;
+  long lastAddPushed;
+  long lastAddConfirmed;
+  final DigestManager macManager;
+  final DistributionSchedule distributionSchedule;
+
+  final Queue<PendingAddOp> pendingAddOps = new ArrayDeque<PendingAddOp>();
+
+  LedgerHandle(BookKeeper bk, long ledgerId, LedgerMetadata metadata,
+      DigestType digestType, byte[] password) throws GeneralSecurityException {
+    this.bk = bk;
+    this.metadata = metadata;
+    if (metadata.isClosed()) {
+      lastAddConfirmed = lastAddPushed = metadata.close;
+    } else {
+      lastAddConfirmed = lastAddPushed = -1;
     }
-    
-    /**
-     * Once we read all changes to the bookie configuration, we
-     * have to call this method to generate an array that we use
-     * to determine the bookie configuration for an entry.
-     * 
-     * Note that this array is a performance optimization and 
-     * it is not necessary for correctness. We could just use 
-     * bookieConfigMap but it would be slower.
-     */
-    
-    void prepareEntryChange(){
-        entryChange = new long[bookieConfigMap.size()];
-    
-        int counter = 0;
-        for(Long l : bookieConfigMap.keySet()){
-            entryChange[counter++] = l;
+
+    this.ledgerId = ledgerId;
+    macManager = DigestManager.instantiate(ledgerId, password, digestType);
+    this.ledgerKey = MacDigestManager.genDigest("ledger", password);
+    distributionSchedule = new RoundRobinDistributionSchedule(
+        metadata.quorumSize, metadata.ensembleSize);
+  }
+
+  /**
+   * Get the id of the current ledger
+   * 
+   * @return
+   */
+  public long getId() {
+    return ledgerId;
+  }
+
+  /**
+   * Get the last confirmed entry id on this ledger
+   * 
+   * @return
+   */
+  public long getLastAddConfirmed() {
+    return lastAddConfirmed;
+  }
+
+  /**
+   * Get the entry id of the last entry that has been enqueued for addition (but
+   * may not yet have been persisted to the ledger)
+   * 
+   * @return
+   */
+  public long getLastAddPushed() {
+    return lastAddPushed;
+  }
+
+  void writeLedgerConfig(StatCallback callback, Object ctx) {
+    bk.getZkHandle().setData(StringUtils.getLedgerNodePath(ledgerId),
+        metadata.serialize(), -1, callback, ctx);
+  }
+
+  /**
+   * Close this ledger synchronously.
+   * 
+   */
+  public void close() throws InterruptedException {
+    SyncCounter counter = new SyncCounter();
+    counter.inc();
+
+    asyncClose(this, counter);
+
+    counter.block(0);
+  }
+
+  /**
+   * Asynchronous close, any adds in flight will return errors
+   * 
+   * @param cb
+   *          callback implementation
+   * @param ctx
+   *          control object
+   * @throws InterruptedException
+   */
+  public void asyncClose(CloseCallback cb, Object ctx) {
+    asyncClose(cb, ctx, BKException.Code.LedgerClosedException);
+  }
+
+  /**
+   * Same as the public version of asyncClose, except that this one takes an
+   * additional parameter which is the return code to hand to all the pending
+   * add ops
+   * 
+   * @param cb
+   * @param ctx
+   * @param rc
+   */
+  private void asyncClose(final CloseCallback cb, final Object ctx, final int rc) {
+
+    bk.mainWorkerPool.submitOrdered(ledgerId, new SafeRunnable() {
+
+      @Override
+      public void safeRun() {
+        // Close operation is idempotent, so no need to check if we are
+        // already closed
+        metadata.close(lastAddConfirmed);
+        errorOutPendingAdds(rc);
+        lastAddPushed = lastAddConfirmed;
+
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Closing ledger: " + ledgerId + " at entryId: "
+              + metadata.close);
         }
-    }
-    
-    /**
-     * Return the quorum size. By default, the size of a quorum is (n+1)/2, 
-     * where n is the size of the set of bookies.
-     * @return int
-     */
-    int getQuorumSize(){
-        return qSize;   
-    }
-    
-    
-    /**
-     *  Returns the config corresponding to the entry
-     *  
-     * @param entry
-     * @return
-     */
-    private ArrayList<BookieHandle> getConfig(long entry){
-        if(bookieConfigMap == null)
-            return bookies;
-        
-        int index = Arrays.binarySearch(entryChange, entry);
-        
-        /*
-         * If not on the map, binarySearch returns a negative value
-         */
-        int before = index;
-        index = index >= 0? index : ((-1) - index);
-
-        if(index == 0){
-            if((entry % 10) == 0){
-                LOG.info("Index: " + index + ", " + before + ", " + entry + ", " + bookieConfigMap.get(entryChange[index]).size());
+
+        writeLedgerConfig(new StatCallback() {
+          @Override
+          public void processResult(int rc, String path, Object subctx,
+              Stat stat) {
+            if (rc != KeeperException.Code.OK.intValue()) {
+              cb.closeComplete(BKException.Code.ZKException, LedgerHandle.this,
+                  ctx);
+            } else {
+              cb.closeComplete(BKException.Code.OK, LedgerHandle.this, ctx);
             }
-            return bookieConfigMap.get(entryChange[index]); 
-        } else{
-            //LOG.warn("IndexDiff " + entry);
-            return bookieConfigMap.get(entryChange[index - 1]);
-        }
-    }
-    
-    /**
-     * Returns the quorum mode for this ledger: Verifiable or Generic
-     */
-    QMode getQMode(){
-        return qMode;   
-    }
-    
-    /**
-     * Sets message digest algorithm.
-     */
-    
-    void setDigestAlg(String alg){
-        this.digestAlg = alg;
-    }
-    
-    /**
-     * Get message digest algorithm.
-     */
-    
-    String getDigestAlg(){
-        return digestAlg;
+          }
+        }, null);
+
+      }
+    });
+  }
+
+  /**
+   * Read a sequence of entries synchronously.
+   * 
+   * @param firstEntry
+   *          id of first entry of sequence (included)
+   * @param lastEntry
+   *          id of last entry of sequence (included)
+   * 
+   */
+  public Enumeration<LedgerEntry> readEntries(long firstEntry, long lastEntry)
+      throws InterruptedException, BKException {
+    SyncCounter counter = new SyncCounter();
+    counter.inc();
+
+    asyncReadEntries(firstEntry, lastEntry, this, counter);
+
+    counter.block(0);
+    if (counter.getrc() != BKException.Code.OK) {
+      throw BKException.create(counter.getrc());
     }
-    
-    /**
-     * Generates and stores Ledger key.
-     * 
-     * @param passwd
-     */
-    
-    private void genLedgerKey(byte[] passwd){
-        try{
-            MessageDigest digest = MessageDigest.getInstance("SHA");
-            String pad = "ledger";
-            
-            byte[] toProcess = new byte[passwd.length + pad.length()];
-            System.arraycopy(pad.getBytes(), 0, toProcess, 0, pad.length());
-            System.arraycopy(passwd, 0, toProcess, pad.length(), passwd.length);
-        
-            digest.update(toProcess);
-            this.ledgerKey = digest.digest();
-        } catch(NoSuchAlgorithmException e){
-            this.passwd = passwd;
-            LOG.error("Storing password as plain text because secure hash implementation does not exist");
-        }
+
+    return counter.getSequence();
+  }
+
+  /**
+   * Read a sequence of entries asynchronously.
+   * 
+   * @param firstEntry
+   *          id of first entry of sequence
+   * @param lastEntry
+   *          id of last entry of sequence
+   * @param cb
+   *          object implementing read callback interface
+   * @param ctx
+   *          control object
+   */
+  public void asyncReadEntries(long firstEntry, long lastEntry,
+      ReadCallback cb, Object ctx) {
+    // Little sanity check
+    if (firstEntry < 0 || lastEntry > lastAddConfirmed
+        || firstEntry > lastEntry) {
+      cb.readComplete(BKException.Code.ReadException, this, null, ctx);
+      return;
     }
-    
-    /**
-     * Generates and stores Mac key.
-     * 
-     * @param passwd
-     */
-    
-    private void genMacKey(byte[] passwd){
-        try{
-            MessageDigest digest = MessageDigest.getInstance("SHA");
-            String pad = "mac";
-            
-            byte[] toProcess = new byte[passwd.length + pad.length()];
-            System.arraycopy(pad.getBytes(), 0, toProcess, 0, pad.length());
-            System.arraycopy(passwd, 0, toProcess, pad.length(), passwd.length);
-        
-            digest.update(toProcess);
-            this.macKey = digest.digest();
-        } catch(NoSuchAlgorithmException e){
-            this.passwd = passwd;
-            LOG.error("Storing password as plain text because secure hash implementation does not exist");
+
+    new PendingReadOp(this, firstEntry, lastEntry, cb, ctx).initiate();
+
+  }
+
+  /**
+   * Add entry synchronously to an open ledger.
+   * 
+   * Delegates to asyncAddEntry with this handle as the callback and a fresh
+   * SyncCounter as the context, then blocks on that counter until the
+   * callback has fired.
+   * 
+   * @param data
+   *          array of bytes to be written to the ledger
+   * @return the return code of the add; this is BKException.Code.OK whenever
+   *         the method returns normally
+   * @throws InterruptedException
+   *           if interrupted while waiting for the add to complete
+   * @throws BKException
+   *           if the add completed with a return code other than OK
+   */
+  public long addEntry(byte[] data) throws InterruptedException, BKException {
+    if (LOG.isDebugEnabled()) {
+      // Concatenating a byte[] only prints its identity hash, so log the
+      // payload size instead.
+      LOG.debug("Adding entry of length " + (data == null ? 0 : data.length));
+    }
+    SyncCounter counter = new SyncCounter();
+    counter.inc();
+
+    asyncAddEntry(data, this, counter);
+    counter.block(0);
+
+    // Report failures the same way readEntries does instead of handing the
+    // error code back as if it were a normal result.
+    if (counter.getrc() != BKException.Code.OK) {
+      throw BKException.create(counter.getrc());
+    }
+    return counter.getrc();
+  }
+
+  /**
+   * Add entry asynchronously to an open ledger.
+   * 
+   * The work is submitted to bk.mainWorkerPool, ordered by ledgerId. If the
+   * ledger metadata reports the ledger as closed, cb.addComplete is invoked
+   * with BKException.Code.LedgerClosedException and an entry id of -1.
+   * Otherwise the entry is given the next id (++lastAddPushed), a
+   * PendingAddOp for it is appended to pendingAddOps, and the op is
+   * initiated with the buffer produced by
+   * macManager.computeDigestAndPackageForSending from the entry id,
+   * lastAddConfirmed and data.
+   * 
+   * @param data
+   *          array of bytes to be written
+   * @param cb
+   *          object implementing the add callback interface
+   * @param ctx
+   *          some control object, handed back to cb unchanged
+   */
+  public void asyncAddEntry(final byte[] data, final AddCallback cb,
+      final Object ctx) {
+    bk.mainWorkerPool.submitOrdered(ledgerId, new SafeRunnable() {
+      @Override
+      public void safeRun() {
+        if (metadata.isClosed()) {
+          LOG.warn("Attempt to add to closed ledger: " + ledgerId);
+          cb.addComplete(BKException.Code.LedgerClosedException,
+              LedgerHandle.this, -1, ctx);
+          return;
         }
+
+        long entryId = ++lastAddPushed;
+        PendingAddOp op = new PendingAddOp(LedgerHandle.this, cb, ctx, entryId);
+        pendingAddOps.add(op);
+        ChannelBuffer toSend = macManager.computeDigestAndPackageForSending(
+            entryId, lastAddConfirmed, data);
+        op.initiate(toSend);
+
+      }
+    });
+  }
+
+  // Close the ledger via asyncClose with a no-op callback, null context and the given rc; presumably the close path fails every add still in the pipeline with rc (TODO confirm in asyncClose).
+  void handleUnrecoverableErrorDuringAdd(int rc) {
+    asyncClose(NoopCloseCallback.instance, null, rc);
+  }
+
+  void errorOutPendingAdds(int rc) {
+    PendingAddOp pendingAddOp;
+    while ((pendingAddOp = pendingAddOps.poll()) != null) {
+      pendingAddOp.submitCallback(rc);
     }
-    
-    /**
-     * Returns password in plain text
-     */
-    byte[] getPasswd(){
-    	return passwd;
-    }
-    
-    
-    /**
-     * Returns MAC key
-     * 
-     * @return byte[]
-     */
-    byte[] getMacKey(){
-       return macKey; 
-    }
-   
-    /**
-     * Returns Ledger key
-     * 
-     * @return byte[]
-     */
-    byte[] getLedgerKey(){
-       return ledgerKey; 
-    }
-    
-    void closeUp(){
-        ledger = -1;
-        last = -1;
-        bk.haltBookieHandles(this, bookies);
-    }
-    
-    /**
-     * Close ledger.
-     * 
-     */
-    public void close() 
-    throws KeeperException, InterruptedException, BKException {
-        //Set data on zookeeper
-        ByteBuffer last = ByteBuffer.allocate(8);
-        last.putLong(lastAddConfirmed);
-        LOG.info("Last saved on ZK is: " + lastAddConfirmed);
-        String closePath = BKDefs.prefix + bk.getZKStringId(getId()) + BKDefs.close; 
-        if(bk.getZooKeeper().exists(closePath, false) == null){
-           bk.getZooKeeper().create(closePath, 
-                   last.array(), 
-                   Ids.OPEN_ACL_UNSAFE, 
-                   CreateMode.PERSISTENT); 
-        } 
-        
-        closeUp();
-        StopOp sOp = new StopOp();
-        qe.sendOp(sOp);
-        LOG.info("##### CB worker queue size: " + qe.cbWorker.pendingOps.size());
-    }
-    
-    /**
-     * Asynchronous close
-     *
-     * @param cb    callback implementation
-     * @param ctx   control object
-     * @throws InterruptedException
-     */
-    public void asyncClose(CloseCallback cb, Object ctx)
-    throws InterruptedException {
-        CloseLedgerOp op = new CloseLedgerOp(this, cb, ctx);
-        LedgerManagementProcessor lmp = bk.getMngProcessor();
-        lmp.addOp(op);  
-    }
-       
-    /**
-     * Read a sequence of entries asynchronously.
-     * 
-     * @param firstEntry    id of first entry of sequence
-     * @param lastEntry     id of last entry of sequence
-     * @param cb    object implementing read callback interface
-     * @param ctx   control object 
-     */
-    public void asyncReadEntries(long firstEntry, 
-            long lastEntry, ReadCallback cb, Object ctx)
-    throws BKException, InterruptedException {
-        // Little sanity check
-        if((firstEntry > getLast()) || (firstEntry > lastEntry)) 
-            throw BKException.create(Code.ReadException);
-        
-        Operation r = new ReadOp(this, firstEntry, lastEntry, cb, ctx);
-        qe.sendOp(r); 
-        //qeMap.get(lh.getId()).put(r);
+  }
+
+  void sendAddSuccessCallbacks() {
+    // Start from the head of the queue and proceed while there are
+    // entries that have had all their responses come back
+    PendingAddOp pendingAddOp;
+    while ((pendingAddOp = pendingAddOps.peek()) != null) {
+      if (pendingAddOp.numResponsesPending != 0) {
+        return;
+      }
+      pendingAddOps.remove();
+      lastAddConfirmed = pendingAddOp.entryId;
+      pendingAddOp.submitCallback(BKException.Code.OK);
     }
-    
-    
-    /**
-     * Read a sequence of entries synchronously.
-     * 
-     * @param firstEntry    id of first entry of sequence
-     * @param lastEntry     id of last entry of sequence
-     *
-     */
-    public LedgerSequence readEntries(long firstEntry, long lastEntry) 
-    throws InterruptedException, BKException {
-        // Little sanity check
-        if((firstEntry > getLast()) || (firstEntry > lastEntry))
-            throw BKException.create(Code.ReadException);
-        
-        RetCounter counter = new RetCounter();
-        counter.inc();
-     
-        Operation r = new ReadOp(this, firstEntry, lastEntry, this, counter);
-        qe.sendOp(r);
-        
-        LOG.debug("Going to wait for read entries: " + counter.i);
-        counter.block(0);
-        LOG.debug("Done with waiting: " + counter.i + ", " + firstEntry);
-        
-        if(counter.getSequence() == null){
-            LOG.error("Failed to read entries: " + firstEntry + ", " + lastEntry);
-            throw BKException.create(Code.ReadException);
-        }
-        return counter.getSequence();
+
+  }
+
+  void handleBookieFailure(InetSocketAddress addr, final int bookieIndex) {
+    InetSocketAddress newBookie;
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Handling failure of bookie: " + addr + " index: "
+          + bookieIndex);
     }
-   
-    /**
-     * Add entry asynchronously to an open ledger.
-     * 
-     * @param data  array of bytes to be written
-     * @param cb    object implementing callbackinterface
-     * @param ctx   some control object
-     */
-    public void asyncAddEntry(byte[] data, AddCallback cb, Object ctx)
-    throws InterruptedException, BKException {
-        AddOp r = new AddOp(this, data, cb, ctx);
-        qe.sendOp(r);
+
+    try {
+      newBookie = bk.bookieWatcher
+          .getAdditionalBookie(metadata.currentEnsemble);
+    } catch (BKNotEnoughBookiesException e) {
+      LOG
+          .error("Could not get additional bookie to remake ensemble, closing ledger: "
+              + ledgerId);
+      handleUnrecoverableErrorDuringAdd(e.getCode());
+      return;
     }
-    
-    
-    /**
-     * Add entry synchronously to an open ledger.
-     * 
-     * @param   data byte[]
-     */
-    
-    public long addEntry(byte[] data)
-    throws InterruptedException, BKException{
-        LOG.debug("Adding entry " + data);
-        RetCounter counter = new RetCounter();
-        counter.inc();
-        
-        Operation r = new AddOp(this, data, this, counter);
-        qe.sendOp(r);   
-        //qeMap.get(lh.getId()).put(r);
-        counter.block(0);
-        return counter.getrc();
+
+    final ArrayList<InetSocketAddress> newEnsemble = new ArrayList<InetSocketAddress>(
+        metadata.currentEnsemble);
+    newEnsemble.set(bookieIndex, newBookie);
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Changing ensemble from: " + metadata.currentEnsemble + " to: "
+          + newEnsemble + " for ledger: " + ledgerId + " starting at entry: "
+          + (lastAddConfirmed + 1));
     }
-    
-    
-    /**
-     * Implementation of callback interface for synchronous read method.
-     * 
-     * @param rc    return code
-     * @param leder ledger identifier
-     * @param seq   sequence of entries
-     * @param ctx   control object
-     */
-    public void readComplete(int rc, 
-            LedgerHandle lh,
-            LedgerSequence seq,  
-            Object ctx){        
-        
-        RetCounter counter = (RetCounter) ctx;
-        counter.setSequence(seq);
-        LOG.debug("Read complete: " + seq.size() + ", " + counter.i);
-        counter.dec();
+
+    metadata.addEnsemble(lastAddConfirmed + 1, newEnsemble);
+
+    writeLedgerConfig(new StatCallback() {
+      @Override
+      public void processResult(final int rc, String path, Object ctx, Stat stat) {
+
+        bk.mainWorkerPool.submitOrdered(ledgerId, new SafeRunnable() {
+          @Override
+          public void safeRun() {
+            if (rc != KeeperException.Code.OK.intValue()) {
+              LOG
+                  .error("Could not persist ledger metadata while changing ensemble to: "
+                      + newEnsemble + " , closing ledger");
+              handleUnrecoverableErrorDuringAdd(BKException.Code.ZKException);
+              return;
+            }
+
+            for (PendingAddOp pendingAddOp : pendingAddOps) {
+              pendingAddOp.unsetSuccessAndSendWriteRequest(bookieIndex);
+            }
+          }
+        });
+
+      }
+    }, null);
+
+  }
+
+  void recover(GenericCallback<Void> cb) {
+    if (metadata.isClosed()) {
+      // We are already closed, nothing to do
+      cb.operationComplete(BKException.Code.OK, null);
+      return;
     }
-    
-    /**
-     * Implementation of callback interface for synchronous read method.
-     * 
-     * @param rc    return code
-     * @param leder ledger identifier
-     * @param entry entry identifier
-     * @param ctx   control object
-     */
-    public void addComplete(int rc, 
-            LedgerHandle lh,
-            long entry, 
-            Object ctx){          
-        RetCounter counter = (RetCounter) ctx;
-        
-        counter.setrc(rc);
-        counter.dec();
+
+    new LedgerRecoveryOp(this, cb).initiate();
+  }
+
+  static class NoopCloseCallback implements CloseCallback {
+    static NoopCloseCallback instance = new NoopCloseCallback();
+
+    @Override
+    public void closeComplete(int rc, LedgerHandle lh, Object ctx) {
+      // noop
     }
-    
-    
-    
-    /**
-     * Implements objects to help with the synchronization of asynchronous calls
-     * 
-     */
-    
-    private static class RetCounter {
-        int i;
-        int rc;
-        int total;
-        LedgerSequence seq = null;
-        
-        synchronized void inc() {
-            i++;
-            total++;
-        }
-        synchronized void dec() {
-            i--;
-            notifyAll();
-        }
-        synchronized void block(int limit) throws InterruptedException {
-            while(i > limit) {
-                int prev = i;
-                wait(15000);
-                if(i == prev){
-                    break;
-                }
-            }
-        }
-        synchronized int total() {
-            return total;
-        }
-        
-        void setrc(int rc){
-            this.rc = rc;
-        }
-        
-        int getrc(){
-            return rc;
-        }
-        
-        void setSequence(LedgerSequence seq){
-            this.seq = seq;
-        }
-        
-        LedgerSequence getSequence(){
-            return seq;
-        }
+  }
+
+  /**
+   * Implementation of callback interface for synchronous read method.
+   * 
+   * Expects ctx to be the SyncCounter created by the synchronous caller.
+   * While holding the counter's monitor it stores the sequence and the
+   * return code, decrements the counter and notifies one waiter.
+   * 
+   * @param rc
+   *          return code
+   * @param lh
+   *          ledger handle (not used by this implementation)
+   * @param seq
+   *          sequence of entries
+   * @param ctx
+   *          control object; cast to SyncCounter
+   */
+  public void readComplete(int rc, LedgerHandle lh,
+      Enumeration<LedgerEntry> seq, Object ctx) {
+
+    SyncCounter counter = (SyncCounter) ctx;
+    synchronized (counter) {
+      counter.setSequence(seq);
+      counter.setrc(rc);
+      counter.dec();
+      counter.notify();
+    }
+  }
+
+  /**
+   * Implementation of callback interface for synchronous add method.
+   * 
+   * Expects ctx to be the SyncCounter created by the synchronous caller;
+   * records the return code on it and decrements it.
+   * 
+   * NOTE(review): unlike readComplete and closeComplete, this callback does
+   * not synchronize on the counter or call notify() itself. Presumably
+   * SyncCounter.dec() wakes the blocked caller -- confirm against
+   * SyncCounter.
+   * 
+   * @param rc
+   *          return code
+   * @param lh
+   *          ledger handle (not used by this implementation)
+   * @param entry
+   *          entry identifier (not used by this implementation)
+   * @param ctx
+   *          control object; cast to SyncCounter
+   */
+  public void addComplete(int rc, LedgerHandle lh, long entry, Object ctx) {
+    SyncCounter counter = (SyncCounter) ctx;
+
+    counter.setrc(rc);
+    counter.dec();
+  }
+
+  /**
+   * Close callback method.
+   * 
+   * Expects ctx to be a SyncCounter. The return code is recorded on the
+   * counter first; the counter is then decremented and one waiter notified
+   * while holding the counter's monitor.
+   * 
+   * @param rc
+   *          return code of the close operation
+   * @param lh
+   *          ledger handle (not used by this implementation)
+   * @param ctx
+   *          control object; cast to SyncCounter
+   */
+  public void closeComplete(int rc, LedgerHandle lh, Object ctx) {
+
+    SyncCounter counter = (SyncCounter) ctx;
+    counter.setrc(rc);
+    synchronized (counter) {
+      counter.dec();
+      counter.notify();
     }
+
+  }
 }

+ 0 - 1272
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerManagementProcessor.java

@@ -1,1272 +0,0 @@
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-package org.apache.bookkeeper.client;
-
-import java.io.IOException;
-import java.net.InetSocketAddress;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Random;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.bookkeeper.client.AsyncCallback.CloseCallback;
-import org.apache.bookkeeper.client.AsyncCallback.CreateCallback;
-import org.apache.bookkeeper.client.AsyncCallback.OpenCallback;
-import org.apache.bookkeeper.client.BKException.Code;
-import org.apache.bookkeeper.client.LedgerHandle.QMode;
-import org.apache.bookkeeper.client.QuorumEngine.Operation.StopOp;
-import org.apache.zookeeper.AsyncCallback.StatCallback;
-import org.apache.zookeeper.AsyncCallback.StringCallback;
-import org.apache.zookeeper.AsyncCallback.ChildrenCallback;
-import org.apache.zookeeper.AsyncCallback.DataCallback;
-import org.apache.zookeeper.CreateMode;
-import org.apache.zookeeper.data.Stat;
-import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.ZooDefs.Ids;
-import org.apache.zookeeper.ZooKeeper;
-
-import org.apache.log4j.Logger;
-
-public class LedgerManagementProcessor 
-extends Thread 
-implements StatCallback, StringCallback, ChildrenCallback, DataCallback {
-    
-   Logger LOG = Logger.getLogger(LedgerManagementProcessor.class);
-    
-    static final int MAXATTEMPTS = 3;
-    
-    /**
-     *  Types of ledger operations: CREATE, OPEN, CLOSE
-     */
-    static enum OpType {CREATE, OPEN, CLOSE};
-    
-    /**
-     * Operation descriptor for asynchronous execution. 
-     *
-     */
-    static class LedgerOp {
-        private OpType op;
-        private int action;
-        private int rc = 0;
-        private Object ctx;
-        private LedgerHandle lh;
-
-        /**
-         * Constructor sets the operation type.
-         * 
-         * @param op    operation type
-         */
-        LedgerOp(OpType op, Object ctx){
-            this.op = op;
-            this.ctx = ctx;
-            this.action = 0;
-        }        
-        
-        /**
-         * Returns the operation type.
-         * 
-         * @return OpType
-         */
-        OpType getType(){
-            return op;
-        }
-
-        /**
-         * Set value of action
-         * 
-         * @return int  return action identifier
-         */
-        int setAction(int action){
-            return this.action = action;
-        }
-        
-        /**
-         * Return value of action
-         * 
-         * @return  int  return action identifier
-         */
-        int getAction(){
-            return action;
-        }
-        
-        /**
-         * Set the return code
-         * 
-         * @param rc return code
-         */
-        void setRC(int rc){
-            this.rc = rc;
-        }
-        
-        /**
-         * Return return code
-         * 
-         * @return int return code
-         */
-        int getRC(){
-            return rc;
-        }
-        
-        /**
-         * Return control object
-         * 
-         * @return Object   control object
-         */
-        Object getCtx(){
-            return ctx;
-        }
-        
-        /**
-         * Set ledger handle
-         * 
-         * @param lh ledger handle
-         */
-        
-        void setLh(LedgerHandle lh){
-            this.lh = lh;
-        }
-        
-        /**
-         * Return ledger handle
-         * 
-         * @return LedgerHandle ledger handle
-         */
-        LedgerHandle getLh(){
-            return this.lh;
-        }
-    }
-
-    /**
-     * Create ledger descriptor for asynchronous execution.
-     */
-    static class CreateLedgerOp extends LedgerOp {
-        private long lId;
-        private int ensSize;
-        private int qSize; 
-        private QMode mode;  
-        private byte passwd[];
-        
-        private CreateCallback cb;
-        
-        private List<String> available;
-        private String path;
-        
-        AtomicInteger zkOpCounter;
-        
-        /**
-         * Constructor of request to create a new ledger.
-         * 
-         * @param ensSize   ensemble size
-         * @param qSize     quorum size
-         * @param mode      quorum mode (VERIFIABLE or GENERIC)
-         * @param passwd    password
-         * @param cb        create callback implementation
-         * @param ctx       control object
-         */
-        CreateLedgerOp(int ensSize,
-                int qSize, 
-                QMode mode,  
-                byte passwd[],
-                CreateCallback cb,
-                Object ctx)
-                throws BKException{
-            super(OpType.CREATE, ctx);
-            this.ensSize = ensSize;
-            this.qSize = qSize;
-            this.mode = mode;
-            this.passwd = passwd;
-            this.cb = cb;
-            
-            /*
-             * There are 5 fixed ZK operations, and a variable
-             * number to set the bookies of the new ledger. We
-             * initialize it with 5 and increment as we add bookies
-             * in action 2. 
-             */
-            this.zkOpCounter = new AtomicInteger(5);
-            
-            // Check that quorum size follows the minimum
-            long t;
-            switch(mode){
-            case VERIFIABLE:
-                t = java.lang.Math.round(java.lang.Math.floor((ensSize - 1)/2));
-                if(t == 0){
-                    throw BKException.create(Code.QuorumException);
-                }
-                break;
-            case GENERIC:
-                t = java.lang.Math.round(java.lang.Math.floor((ensSize - 1)/3));
-                if(t == 0){
-                    throw BKException.create(Code.QuorumException);
-                }
-                break;
-            case FREEFORM:
-                break;
-            }
-        }
-        
-        /**
-         * Constructor for cloning. This is necessary because there
-         * are create actions that issue multiple ZK operations, and 
-         * when we queue back the result of the operation we need the
-         * operation object to reflect the result of the operation.
-         * 
-         * @param op
-         */
-        CreateLedgerOp(CreateLedgerOp op){
-            super(OpType.CREATE, op.getCtx());
-            setRC(op.getRC());
-            setAction(op.getAction());
-            
-            this.setLh(op.getLh());
-            this.lId = op.getLid();
-            this.ensSize = op.getEnsembleSize();
-            this.qSize = op.getQuorumSize();
-            this.mode = op.getMode();
-            this.passwd = op.getPasswd();
-            this.cb = op.getCb();
-            this.available = op.getAvailable();
-            this.path = op.getPath(); 
-            this.zkOpCounter = op.zkOpCounter;
-        }
-        
-        /**
-         * Set ledger identifier (sequence number
-         * of ZooKeeper)
-         * 
-         * @param lId
-         */
-        void setLid(long lId){
-            this.lId = lId;
-        }
-        
-        /**
-         * Return ledger identifier
-         * 
-         * @return long ledger identifier
-         */
-        long getLid(){
-            return lId;
-        }
-        
-        /**
-         * Return ensemble size
-         * 
-         * @return int ensemble size
-         */
-        int getEnsembleSize(){
-            return ensSize;
-        }
-        
-        /**
-         * Return quorum size
-         * 
-         * @return int quorum size
-         */
-        int getQuorumSize(){
-           return qSize; 
-        }
-        
-        /**
-         * Return quorum mode
-         * 
-         * @return  QMode   quorum mode
-         */
-        QMode getMode(){
-            return mode;   
-        }
-        
-        /**
-         * Return password
-         * 
-         * @return byte[] passwd
-         */
-        byte[] getPasswd(){
-            return passwd;
-        }
-        
-        /**
-         * Return callback implementation
-         * 
-         * @return CreateCallback   callback implementation
-         */
-        CreateCallback getCb(){
-            return cb;
-        }
-        
-        
-        
-        /**
-         * Set the list of available bookies for processing
-         * 
-         * @param available lost of available bookies
-         */
-        void addAvailable(List<String> available){
-            this.available = available;
-        }
-        
-        /**
-         * Return list of bookies available
-         * 
-         * @return List<String> list of bookies available
-         */
-        List<String> getAvailable(){
-            return available;
-        }
-        
-        /**
-         * Set path as returned in the callback
-         * 
-         * @param path  created path
-         */
-        void setPath(String path){
-            this.path = path;
-        }
-
-        /**
-         * Return path
-         * 
-         * @return String   path
-         */
-        String getPath(){
-            return path;
-        }
-    }
-    
-    /**
-     * Open ledger descriptor for asynchronous execution.
-     */
-    static class OpenLedgerOp extends LedgerOp {
-        private long lId; 
-        private byte passwd[];
-        private OpenCallback cb;
-        
-        private int qSize;
-        private long last;
-        private QMode qMode;
-        private List<String> children;
-        
-        private String dataString;
-        private String item;
-        private AtomicInteger counter;
-
-        /**
-         * Constructor of request to open a ledger.
-         * 
-         * @param lId   ledger identifier
-         * @param passwd    password to access ledger
-         */
-        OpenLedgerOp(long lId, 
-                byte passwd[],
-                OpenCallback cb,
-                Object ctx){
-            super(OpType.OPEN, ctx);
-            this.lId = lId;
-            this.passwd = passwd;
-        }
-        
-        /**
-         * Return ledger identifier
-         * 
-         * @return long
-         */
-        long getLid(){
-            return lId;
-        }
-        
-        /**
-         * Return password
-         * @return byte[]
-         */
-        byte[] getPasswd(){
-            return passwd;
-        }
-
-        /**
-         * Return callback object
-         * 
-         * @return OpenCallback 
-         */
-        OpenCallback getCb(){
-            return this.cb;
-        }
-             
-        /**
-         * Set quorum size as extracted from ZK
-         * 
-         * @param data  znode data
-         */
-        void setQSize(byte[] data){
-            ByteBuffer bb = ByteBuffer.wrap(data);
-            this.qSize = bb.getInt();
-        }
-        
-        /**
-         * Return quorum size
-         * 
-         * @return  int quorum size
-         */
-        int getQSize(){
-            return qSize;
-        }
-        
-        /**
-         * Set last value as read from close znode
-         * 
-         * @param last
-         */
-        void setLast(long last){
-            this.last = last;
-        }
-        
-        /**
-         * Return last value
-         * 
-         * @return long last value
-         */
-        long getLast(){
-            return last;
-        }
-        
-        /**
-         * Set ledger mode 
-         *    
-         * @param mode  GENERIC or VERIFIABLE
-         */
-        void setQMode(QMode mode){
-            this.qMode = mode;
-        }
-        
-        /**
-         * Return ledger mode
-         * 
-         * @return QMode   ledger mode
-         */
-        QMode getQMode(){
-            return qMode;
-        }
-        
-        /**
-         * Set list of bookie identifiers
-         * 
-         * @param list  list of bbokie identifiers
-         */
-        void addChildren(List<String> list){
-            this.children = list;
-        }
-        
-        /**
-         * Return list of bookie identifiers
-         * 
-         * @return List<String> list of bookie identifiers
-         */
-        List<String> getChildren(){
-            return children;
-        }
-        
-        /**
-         * Returns the size of the children list. Used in processOpen.
-         * 
-         * @return int
-         */
-        int getListSize(){
-            return children.size();
-        }
-        
-        /**
-         * Sets the value of item. This is used in processOpen to
-         * keep the item value of the list of ensemble changes.
-         * 
-         * @param item
-         */
-        void setItem(String item){
-            this.item = item;
-        }
-        
-        /**
-         * Returns the value of item
-         * 
-         * @return String
-         */
-        
-        String getItem(){
-            return item;
-        }
-        
-        /**
-         * Sets the value of dataString
-         * 
-         * @param data  value to set
-         */
-        void setStringData(String data){
-            this.dataString = data;
-        }
-        
-        /**
-         * Returns the value of dataString
-         * 
-         * @return String
-         */
-        String getStringData(){
-            return dataString;
-        }
-    }
-    
-    /**
-     * Close ledger descriptor for asynchronous execution.
-     */
-    static class CloseLedgerOp extends LedgerOp {
-        private long lid;
-        private ByteBuffer last;
-        private String closePath;
-        private CloseCallback cb;
-        private Stat stat;
-        
-        /**
-         * Constructor of request to close a ledger
-         * 
-         * @param lh    ledger handle
-         */
-        CloseLedgerOp(LedgerHandle lh, 
-                CloseCallback cb,
-                Object ctx){
-            super(OpType.CLOSE, ctx);
-       
-            this.setLh(lh);
-            this.lid = lh.getId();
-            this.last = ByteBuffer.allocate(8);
-            this.last.putLong(lh.getAddConfirmed());
-            this.cb = cb;
-        }
-        
-        /**
-         * Return a ByteBuffer containing the last entry written
-         * 
-         * @return ByteBuffer identifier of last entry
-         */
-        ByteBuffer getLast(){
-            return last;
-        }
-        
-        /**
-         * Return ledger identifier
-         * 
-         * @return long
-         */
-        long getLid(){
-            return this.lid;
-        }
-        
-        /**
-         * Set close path
-         * 
-         * @param path  close path
-         */
-        void setClosePath(String path){
-            this.closePath = path;
-        }
-        
-        /**
-         * Return close path string.
-         * 
-         * @return String   close path
-         */
-        String getClosePath(){
-            return this.closePath;
-        }
-        
-        
-        /**
-         * Return callback object.
-         * 
-         * @return CloseCallback 
-         */
-        CloseCallback getCb(){
-            return this.cb;
-        }
-     
-    
-        /**
-         * Set value of stat
-         * 
-         * @param stat stat object returned by ZK callback
-         */
-        void setStat (Stat stat){
-            this.stat = stat;
-        }
-        
-        /**
-         * Return value of stat
-         * 
-         * @return Stat
-         */
-        
-        Stat getStat (){
-            return stat;
-        }
-    }
-    
-    /*
-     * BookKeeper parent.
-     */
-    BookKeeper bk;
-    /*
-     * Queue of outstanding operations
-     */
-    ArrayBlockingQueue<LedgerOp> outstandingRequests = 
-        new ArrayBlockingQueue<LedgerOp>(200);
-    
-    
-    /**
-     * Add ledger operation to queue of pending
-     * 
-     * @param op    ledger operation
-     */
-    void addOp(LedgerOp op)
-    throws InterruptedException{
-        LOG.info("Queuing new op");
-        outstandingRequests.put(op);
-    }
-    
-    /**
-     * Constructor takes BookKeeper object 
-     * 
-     * @param bk BookKeeper object
-     */
-    
-    LedgerManagementProcessor(BookKeeper bk){
-        this.bk = bk;
-    }
-    
-    /**
-     * Run method
-     */
-    public void run(){
-        while(true){
-            try{
-                LedgerOp op = outstandingRequests.take();
-            
-                switch(op.getType()){
-                case CREATE:
-                    processCreate((CreateLedgerOp) op);
-                    break;            
-                case OPEN:
-                    processOpen((OpenLedgerOp) op);
-                    break;
-                case CLOSE:
-                    processClose((CloseLedgerOp) op);
-                    break;
-                }
-            } catch(InterruptedException e){
-                LOG.warn("Interrupted while waiting in the queue of incoming requests");   
-            }
-        }
-    }
-    
-    /**
-     * Processes a create ledger operation.
-     * 
-     * @param cop   create ledger operation to process
-     * @throws InterruptedException
-     */
-    
-    private void processCreate(CreateLedgerOp cop)
-    throws InterruptedException {
-        if(cop.getRC() != BKDefs.EOK)
-            cop.getCb().createComplete(cop.getRC(), null, cop.getCtx());
-
-        switch(cop.getAction()){
-        case 0:
-            LOG.info("Action 0 of create");
-            /*
-             * Create ledger node on ZK.
-             * We get the id from the sequence number on the node.
-             */
-            bk.getZooKeeper().create(BKDefs.prefix, 
-                new byte[0], 
-                Ids.OPEN_ACL_UNSAFE, 
-                CreateMode.PERSISTENT_SEQUENTIAL,
-                this,
-                cop);
-        break;
-        case 1:
-            LOG.info("Action 1 of create");
-            /* 
-             * Extract ledger id.
-             */
-            String parts[] = cop.getPath().split("/");
-            String subparts[] = parts[2].split("L");
-            long lId = Long.parseLong(subparts[1]);
-            cop.setLid(lId);
-        
-            LedgerHandle lh = new LedgerHandle(bk, lId, 0, cop.getQuorumSize(), cop.getMode(), cop.getPasswd());
-            cop.setLh(lh);
-            /* 
-             * Get children from "/ledgers/available" on zk
-             */
-
-            bk.getZooKeeper().getChildren("/ledgers/available", false, this, cop);
-            /* 
-             * Select ensSize servers to form the ensemble
-             */
-            bk.getZooKeeper().create(BKDefs.prefix + bk.getZKStringId(lId) + BKDefs.ensemble, new byte[0], 
-                    Ids.OPEN_ACL_UNSAFE, 
-                    CreateMode.PERSISTENT,
-                    this,
-                    cop);
-            /* 
-             * Add quorum size to ZK metadata
-             */
-            ByteBuffer bb = ByteBuffer.allocate(4);
-            bb.putInt(cop.getQuorumSize());
-            
-            bk.getZooKeeper().create(BKDefs.prefix + bk.getZKStringId(lId) + cop.getQuorumSize(), 
-                    bb.array(), 
-                    Ids.OPEN_ACL_UNSAFE, 
-                    CreateMode.PERSISTENT,
-                    this,
-                    cop);
-            /* 
-             * Quorum mode
-             */
-            bb = ByteBuffer.allocate(4);
-            bb.putInt(cop.getMode().ordinal());
-            
-            bk.getZooKeeper().create(BKDefs.prefix + bk.getZKStringId(lId) + cop.getMode(), 
-                    bb.array(), 
-                    Ids.OPEN_ACL_UNSAFE, 
-                    CreateMode.PERSISTENT,
-                    this,
-                    cop);
-            break;
-        case 2:
-            LOG.info("Action 2 of create");
-            /*
-             * Adding bookies to ledger handle
-             */
-            Random r = new Random();
-            List<String> children = cop.getAvailable();
-            for(int i = 0; i < cop.getEnsembleSize(); i++){
-                int index = 0;
-                if(children.size() > 1) 
-                    index = r.nextInt(children.size() - 1);
-                else if(children.size() == 1)
-                    index = 0;
-                else {
-                    LOG.error("Not enough bookies available");    
-                    cop.setRC(BKDefs.EIB);
-                }
-            
-                try{
-                    String bookie = children.remove(index);
-                    LOG.info("Bookie: " + bookie);
-                    InetSocketAddress tAddr = bk.parseAddr(bookie);
-                    int bindex = cop.getLh().addBookieForWriting(tAddr); 
-                    ByteBuffer bindexBuf = ByteBuffer.allocate(4);
-                    bindexBuf.putInt(bindex);
-                
-                    String pBookie = "/" + bookie;
-                    cop.zkOpCounter.getAndIncrement();
-                    bk.getZooKeeper().create(BKDefs.prefix + bk.getZKStringId(cop.getLid()) + BKDefs.ensemble + pBookie, 
-                            bindexBuf.array(), 
-                            Ids.OPEN_ACL_UNSAFE, 
-                            CreateMode.PERSISTENT, 
-                            this,
-                            cop);
-                } catch (IOException e) {
-                    LOG.error(e);
-                    i--;
-                } 
-            }
-            break;
-        case 3:
-            LOG.info("Action 3 of create");
-            LOG.debug("Created new ledger");
-            cop.getCb().createComplete(cop.getRC(), cop.getLh(), cop.getCtx());   
-            break;
-        case 4:
-            break;
-        }
-    }
-        
-    /**
-     *  Processes open ledger operation.
-     * 
-     * @param oop   open ledger operation to process.
-     * @throws InterruptedException
-     */
-    private void processOpen(OpenLedgerOp oop) 
-    throws InterruptedException {    
-        /*
-         * Case for open operation
-         */
-        if(oop.getRC() != BKDefs.EOK)
-            oop.getCb().openComplete(oop.getRC(), null, oop.getCtx());
-        
-        String path;
-        LedgerHandle lh;
-        
-        switch(oop.getAction()){
-        case 0:                    
-            /*
-             * Check if ledger exists
-             */
-            bk.getZooKeeper().exists(BKDefs.prefix + bk.getZKStringId(oop.getLid()), 
-                    false,
-                    this,
-                    oop);
-            break;
-        case 1:                    
-            /*
-             * Get quorum size.
-             */
-            bk.getZooKeeper().getData(BKDefs.prefix + bk.getZKStringId(oop.getLid()) + BKDefs.quorumSize, 
-                    false,
-                    this,
-                    oop);
-            break;    
-        case 2:         
-            /*
-             * Get last entry written from ZK 
-             */
-                
-            long last = 0;
-            LOG.debug("Close path: " + BKDefs.prefix + bk.getZKStringId(oop.getLid()) + BKDefs.close);
-            bk.getZooKeeper().exists(BKDefs.prefix + bk.getZKStringId(oop.getLid()) + BKDefs.close, 
-                    false,
-                    this,
-                    oop);
-            break;
-        case 3:
-            try{
-                bk.recoverLedger(oop.getLid(), oop.getPasswd());
-            } catch(Exception e){
-                LOG.error("Cannot recover ledger", e);
-                oop.getCb().openComplete(BKDefs.ERL, null, oop.getCtx());
-            }
-            /*
-             * In the case of recovery, it falls through to the
-             * next case intentionally.
-             */
-        case 4:   
-            bk.getZooKeeper().getData(BKDefs.prefix + bk.getZKStringId(oop.getLid()) + BKDefs.close, 
-                    false, 
-                    this,
-                    oop);
-            break;
-        case 5:                
-            /*
-             * Quorum mode 
-             */
-            bk.getZooKeeper().getData(BKDefs.prefix + bk.getZKStringId(oop.getLid()) + BKDefs.quorumMode, 
-                    false, 
-                    this,
-                    oop);
-        case 6:         
-            /*
-             *  Create ledger handle
-             */
-            lh = new LedgerHandle(bk, oop.getLid(), oop.getLast(), oop.getQSize(), oop.getQMode(), oop.getPasswd());
-                
-            /*
-             * Get children of "/ledgers/id/ensemble" 
-             */
-              
-            bk.getZooKeeper().getChildren(BKDefs.prefix + bk.getZKStringId(oop.getLid()) + BKDefs.ensemble, 
-                    false,
-                    this,
-                    oop);
-            break;
-
-        case 7:
-            List<String> list = oop.getChildren();
-            LOG.info("Length of list of bookies: " + list.size());
-            try{
-                for(int i = 0 ; i < list.size() ; i++){
-                    for(String s : list){
-                        byte[] bindex = bk.getZooKeeper().getData(BKDefs.prefix + bk.getZKStringId(oop.getLid()) + BKDefs.ensemble + "/" + s, 
-                                false, new Stat());
-                        ByteBuffer bindexBuf = ByteBuffer.wrap(bindex);
-                        if(bindexBuf.getInt() == i){                      
-                            oop.getLh().addBookieForReading(bk.parseAddr(s));
-                        }
-                    }
-                }
-
-                /*
-                 * Check if there has been any change to the ensemble of bookies
-                 * due to failures.
-                 */
-                bk.getZooKeeper().exists(BKDefs.prefix + 
-                        bk.getZKStringId(oop.getLid()) +  
-                        BKDefs.quorumEvolution, 
-                                false,
-                                this,
-                                oop);
-                        
-            } catch(KeeperException e){
-                LOG.error("Exception while adding bookies", e);
-                oop.setRC(BKDefs.EZK);
-                oop.getCb().openComplete(oop.getRC(), oop.getLh(), oop.getCtx());
-            } catch(IOException e){
-                LOG.error("Exception while trying to connect to bookie");
-                oop.setRC(BKDefs.EIO);
-                oop.getCb().openComplete(oop.getRC(), oop.getLh(), oop.getCtx());
-            } 
-            
-             break;
-        
-        case 8:
-            path = BKDefs.prefix + 
-            bk.getZKStringId(oop.getLid()) +  
-            BKDefs.quorumEvolution;
-                
-            bk.getZooKeeper().getChildren(path, 
-                    false,
-                    this,
-                    oop);
-        case 9: 
-            oop.getCb().openComplete(oop.getRC(), oop.getLh(), oop.getCtx());
-            break;
-        case 10:        
-            path = BKDefs.prefix + 
-            bk.getZKStringId(oop.getLid()) +  
-            BKDefs.quorumEvolution;
-            
-            for(String s : oop.getChildren()){
-                oop.setItem(s);
-                bk.getZooKeeper().getData(path + "/" + s, 
-                        false, 
-                        this,
-                        oop);
-            }
-            
-            break;
-        case 11:
-            lh = oop.getLh();
-            
-            String parts[] = oop.getStringData().split(" ");
-
-            ArrayList<BookieHandle> newBookieSet = new ArrayList<BookieHandle>();
-            for(int i = 0 ; i < parts.length ; i++){
-                LOG.info("Address: " + parts[i]);
-                InetSocketAddress faultyBookie =  
-                    bk.parseAddr(parts[i].substring(1));                           
-        
-                newBookieSet.add(lh.getBookieHandleDup(faultyBookie));
-            }
-            lh.setNewBookieConfig(Long.parseLong(oop.getItem()), newBookieSet);
-        
-            if(oop.counter.incrementAndGet() == oop.getListSize()){
-                lh.prepareEntryChange();
-                oop.getCb().openComplete(oop.getRC(), oop.getLh(), oop.getCtx());
-            }
-            
-            break;
-        }
-    }    
-    
-    
-   /**
-    * Processes close ledger operation.
-    * 
-    * @param clop   close ledger operation to process.
-    * @throws InterruptedException
-    */
-    
-    private void processClose(CloseLedgerOp clop)
-    throws InterruptedException {
-        if(clop.getRC() != BKDefs.EOK)
-            clop.getCb().closeComplete(clop.getRC(), clop.getLh(), clop.getCtx());
-        
-        switch(clop.getAction()){
-        case 0:
-            LOG.info("Last saved on ZK is: " + clop.getLh().getLast()); 
-            clop.setClosePath(BKDefs.prefix + bk.getZKStringId(getId()) + BKDefs.close);
-            bk.getZooKeeper().exists(clop.getClosePath(), null, this, clop);
-            break;             
-        case 1:
-            if(clop.getStat() == null){
-                bk.getZooKeeper().create(clop.getClosePath(), 
-                        clop.getLast().array(), 
-                        Ids.OPEN_ACL_UNSAFE, 
-                        CreateMode.PERSISTENT, 
-                        this,
-                        clop);
-            } else {
-                bk.getZooKeeper().setData(clop.getClosePath(), 
-                        clop.getLast().array(), -1, this, clop);
-            }
-            break;
-        case 2:   
-            LedgerHandle lh = clop.getLh(); 
-            try{
-                lh.closeUp();
-                StopOp sOp = new StopOp();
-                lh.getQuorumEngine().sendOp(sOp);
-
-            } catch(Exception e) {
-                LOG.warn("Exception while stopping quorum engine: " + lh.getId());
-            }
-            clop.getCb().closeComplete(BKDefs.EOK, clop.getLh(), clop.getCtx());
-        
-            break;
-        }    
-    }
-    
-    /**
-     * Implements org.apache.zookeeper.AsyncCallback.StatCallback 
-     */
-    public void processResult(int rc, String path, Object ctx, Stat stat){
-        LedgerOp op = (LedgerOp) ctx;
-       
-        if(rc != BKDefs.EOK){
-            op.setRC(rc);
-            while(true){
-                try{
-                    this.addOp(op);
-                    return;
-                } catch(InterruptedException e) {
-                    LOG.warn("Interrupted while trying to add operation to queue", e);
-                }
-            }
-        }
-        
-        switch(op.getType()){
-        case CREATE:
-            break;
-        case OPEN:
-            switch(op.getAction()){
-            case 0:
-                if(stat == null)
-                    op.setRC(BKDefs.ENL);
-                break;
-            case 2:
-                /*
-                 * If there is no "close" znode, then we have
-                 * to recover this ledger
-                 */
-                if(stat == null)
-                    op.setAction(3);
-                else
-                    op.setAction(4);
-                break;
-            case 8:
-                if(stat == null)
-                    op.setAction(9);
-                else
-                    op.setAction(10);
-                break;
-            }
-        case CLOSE:
-            CloseLedgerOp clop = (CloseLedgerOp) op;
-            clop.setStat(stat);
-            clop.setAction(1);
-            break;
-        }
-    
-        /*
-         * Queues operation for processing
-         */
-        int counter = 0;
-        boolean leave = false;
-        while(!leave){
-            try{
-                this.addOp(op);
-                leave = true;
-            } catch(InterruptedException e) {
-                if(counter++ > MAXATTEMPTS){
-                    LOG.error("Exceed maximum number of attempts");
-                    leave = true;
-                } else
-                    LOG.warn("Interrupted while trying to add operation to queue", e);
-            }
-        }
-    
-    }   
-    
-    /**
-     * Implements org.apache.zookeeper.AsyncCallback.StringCallback 
-     */
-    public void processResult(int rc, String path, Object ctx, String name){
-        LedgerOp op = (LedgerOp) ctx;
-        
-        if(rc != BKDefs.EOK){
-            op.setRC(rc);
-        } else switch(op.getType()){
-               case CREATE:
-                   CreateLedgerOp cop = (CreateLedgerOp) op;
-
-                   int counter = cop.zkOpCounter.decrementAndGet(); 
-                   if(op.getAction() == 0){
-                       cop.setAction(1);
-                       cop.setPath(name);
-                       op.setRC(rc);               
-                   } else {
-                       if(counter == 0){
-                           cop.setAction(3);
-                       } else {
-                           /*
-                            * Could queue a no-op, but for optimization 
-                            * purposes, let's return here
-                            */
-                           return;
-                       }
-
-                   }
-                   op = cop;
-                   break;
-               case OPEN:
-                   break;
-               case CLOSE:
-                   CloseLedgerOp clop = (CloseLedgerOp) op;
-                   clop.setAction(1);
-                   break;
-        }
-        
-        /*
-         * Queues operation for processing 
-         */
-        int counter = 0;
-        boolean leave = false;
-        while(!leave){
-            try{
-                this.addOp(op);
-                leave = true;
-            } catch(InterruptedException e) {
-                if(counter++ > MAXATTEMPTS){
-                    LOG.error("Exceed maximum number of attempts");
-                    leave = true;
-                } else
-                    LOG.warn("Interrupted while trying to add operation to queue", e);
-            }
-        }
-        LOG.info("Leaving loop");
-    }
-    
-    /**
-     * Implement org.apache.zookeeper.AsyncCallback.ChildrenCallback
-     */
-    public void processResult(int rc, String path, Object ctx, List<String> children){
-       LedgerOp op = (LedgerOp) ctx;
-       
-       LOG.info("Processing children callback");
-       if(rc != BKDefs.EOK){
-           op.setRC(rc);
-       } else switch(op.getType()){
-              case CREATE:
-                  CreateLedgerOp cop = (CreateLedgerOp) op;
-                  cop.addAvailable(children);
-                  int counter = cop.zkOpCounter.decrementAndGet();
-                  LOG.info("ZK Op counter value: " + counter);
-                  cop.setAction(2);
-                  
-                  op = cop;
-                  break;
-              case OPEN:
-                  OpenLedgerOp oop = (OpenLedgerOp) op;
-                  oop.addChildren(children);
-                  break;
-       }
-       
-       int counter = 0;
-       boolean leave = false;
-       while(!leave){
-           try{
-               this.addOp(op);
-               leave = true;
-           } catch(InterruptedException e) {
-               if(counter++ > MAXATTEMPTS){
-                   LOG.error("Exceed maximum number of attempts");
-                   leave = true;
-               } else
-                   LOG.warn("Interrupted while trying to add operation to queue", e);
-           }
-       }
-    }
-    
-    /**
-     * Implement org.apache.zookeeper.AsyncCallback.DataCallback
-     */
-    public void processResult(int rc, String path, Object ctx, byte[] data, Stat stat){
-        LedgerOp op = (LedgerOp) ctx;
-        ByteBuffer bb;
-        
-        if(rc != BKDefs.EOK){
-            op.setRC(rc);
-        } else switch(op.getType()){
-               case OPEN:
-                   OpenLedgerOp oop = (OpenLedgerOp) op;
-                   switch(oop.getAction()){
-                   case 1: 
-                       oop.setQSize(data);
-                       break;
-                   case 4:
-                       bb = ByteBuffer.wrap(data);
-                       oop.setLast(bb.getLong());
-                       break;
-                   case 5:
-                       bb = ByteBuffer.wrap(data);
-                       
-                       switch(bb.getInt()){
-                       case 1:
-                           oop.setQMode(QMode.GENERIC);
-                           LOG.info("Generic ledger");
-                           break;
-                       case 2:
-                           oop.setQMode(QMode.FREEFORM);
-                           break;
-                       default:
-                           oop.setQMode(QMode.VERIFIABLE);
-                       LOG.info("Verifiable ledger");
-                       }
-                       break;
-                   case 10:
-                       String addr = new String(data);
-                       oop.setStringData(addr);
-                       oop.setAction(11);
-                       break;
-                   }
-                   break;
-               default:
-                   LOG.warn("Wrong type");
-                   break;  
-        }
-        
-        int counter = 0;
-        boolean leave = false;
-        while(!leave){
-            try{
-                this.addOp(op);
-                leave = true;
-            } catch(InterruptedException e) {
-                if(counter++ > MAXATTEMPTS){
-                    LOG.error("Exceed maximum number of attempts");
-                    leave = true;
-                } else
-                    LOG.warn("Interrupted while trying to add operation to queue", e);
-            }
-        }
-    }
-}

+ 179 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerMetadata.java

@@ -0,0 +1,179 @@
+package org.apache.bookkeeper.client;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.apache.bookkeeper.util.StringUtils;
+import org.apache.log4j.Logger;
+
+/**
+ * This class encapsulates all the ledger metadata that is persistently stored
+ * in zookeeper. It provides parsing and serialization methods of such metadata.
+ * 
+ */
+class LedgerMetadata {
+    static final Logger LOG = Logger.getLogger(LedgerMetadata.class);
+
+    private static final String closed = "CLOSED";
+    private static final String lSplitter = "\n";
+    private static final String tSplitter = "\t";
+
+    // can't use -1 for NOTCLOSED because that is reserved for a closed, empty
+    // ledger
+    public static final int NOTCLOSED = -101;
+    int ensembleSize;
+    int quorumSize;
+    long close;
+    private SortedMap<Long, ArrayList<InetSocketAddress>> ensembles = new TreeMap<Long, ArrayList<InetSocketAddress>>();
+    ArrayList<InetSocketAddress> currentEnsemble;
+
+    public LedgerMetadata(int ensembleSize, int quorumSize) {
+        this.ensembleSize = ensembleSize;
+        this.quorumSize = quorumSize;
+        this.close = NOTCLOSED;
+    };
+
+    private LedgerMetadata() {
+        this(0, 0);
+    }
+
+    boolean isClosed() {
+        return close != NOTCLOSED;
+    }
+
+    void close(long entryId) {
+        close = entryId;
+    }
+
+    void addEnsemble(long startEntryId, ArrayList<InetSocketAddress> ensemble) {
+        assert ensembles.isEmpty() || startEntryId >= ensembles.lastKey();
+
+        ensembles.put(startEntryId, ensemble);
+        currentEnsemble = ensemble;
+    }
+
+    ArrayList<InetSocketAddress> getEnsemble(long entryId) {
+        // the head map cannot be empty, since we insert an ensemble for
+        // entry-id 0, right when we start
+        return ensembles.get(ensembles.headMap(entryId + 1).lastKey());
+    }
+
+    /**
+     * the entry id > the given entry-id at which the next ensemble change takes
+     * place ( -1 if no further ensemble changes)
+     * 
+     * @param entryId
+     * @return
+     */
+    long getNextEnsembleChange(long entryId) {
+        SortedMap<Long, ArrayList<InetSocketAddress>> tailMap = ensembles.tailMap(entryId + 1);
+
+        if (tailMap.isEmpty()) {
+            return -1;
+        } else {
+            return tailMap.firstKey();
+        }
+    }
+
+    /**
+     * Generates a byte array based on a LedgerConfig object received.
+     * 
+     * @param config
+     *            LedgerConfig object
+     * @return byte[]
+     */
+    public byte[] serialize() {
+        StringBuilder s = new StringBuilder();
+        s.append(quorumSize).append(lSplitter).append(ensembleSize);
+
+        for (Map.Entry<Long, ArrayList<InetSocketAddress>> entry : ensembles.entrySet()) {
+            s.append(lSplitter).append(entry.getKey());
+            for (InetSocketAddress addr : entry.getValue()) {
+                s.append(tSplitter);
+                StringUtils.addrToString(s, addr);
+            }
+        }
+
+        if (close != NOTCLOSED) {
+            s.append(lSplitter).append(close).append(tSplitter).append(closed);
+        }
+
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("Serialized config: " + s.toString());
+        }
+
+        return s.toString().getBytes();
+    }
+
+    /**
+     * Parses a given byte array and transforms into a LedgerConfig object
+     * 
+     * @param array
+     *            byte array to parse
+     * @return LedgerConfig
+     * @throws IOException
+     *             if the given byte[] cannot be parsed
+     */
+
+    static LedgerMetadata parseConfig(byte[] bytes) throws IOException {
+
+        LedgerMetadata lc = new LedgerMetadata();
+        String config = new String(bytes);
+
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("Parsing Config: " + config);
+        }
+
+        String lines[] = config.split(lSplitter);
+
+        if (lines.length < 2) {
+            throw new IOException("Quorum size or ensemble size absent from config: " + config);
+        }
+
+        try {
+            lc.quorumSize = new Integer(lines[0]);
+            lc.ensembleSize = new Integer(lines[1]);
+
+            for (int i = 2; i < lines.length; i++) {
+                String parts[] = lines[i].split(tSplitter);
+
+                if (parts[1].equals(closed)) {
+                    lc.close = new Long(parts[0]);
+                    break;
+                }
+
+                ArrayList<InetSocketAddress> addrs = new ArrayList<InetSocketAddress>();
+                for (int j = 1; j < parts.length; j++) {
+                    addrs.add(StringUtils.parseAddr(parts[j]));
+                }
+                lc.addEnsemble(new Long(parts[0]), addrs);
+            }
+        } catch (NumberFormatException e) {
+            throw new IOException(e);
+        }
+        return lc;
+    }
+
+}

+ 136 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerOpenOp.java

@@ -0,0 +1,136 @@
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+package org.apache.bookkeeper.client;
+
+import java.io.IOException;
+import java.security.GeneralSecurityException;
+import org.apache.bookkeeper.client.AsyncCallback.OpenCallback;
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
+import org.apache.bookkeeper.util.StringUtils;
+import org.apache.log4j.Logger;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.AsyncCallback.DataCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.GenericCallback;
+import org.apache.zookeeper.data.Stat;
+
+/**
+ * Encapsulates the ledger open operation
+ * 
+ */
+class LedgerOpenOp implements DataCallback {
+    static final Logger LOG = Logger.getLogger(LedgerOpenOp.class);
+    
+    BookKeeper bk;
+    long ledgerId;
+    OpenCallback cb;
+    Object ctx;
+    LedgerHandle lh;
+    byte[] passwd;
+    DigestType digestType;
+    
+    /**
+     * Constructor.
+     * 
+     * @param bk
+     * @param ledgerId
+     * @param digestType
+     * @param passwd
+     * @param cb
+     * @param ctx
+     */
+    
+    public LedgerOpenOp(BookKeeper bk, long ledgerId, DigestType digestType, byte[] passwd, OpenCallback cb, Object ctx) {
+        this.bk = bk;
+        this.ledgerId = ledgerId;
+        this.passwd = passwd;
+        this.cb = cb;
+        this.ctx = ctx;
+        this.digestType = digestType;
+    }
+
+    /**
+     * Inititates the ledger open operation
+     */
+    public void initiate() {
+        /**
+         * Asynchronously read the ledger metadata node.
+         */
+
+        bk.getZkHandle().getData(StringUtils.getLedgerNodePath(ledgerId), false, this, ctx);
+
+    }
+
+    /**
+     * Implements ZooKeeper data callback.
+     * @see org.apache.zookeeper.AsyncCallback.DataCallback#processResult(int, String, Object, byte[], Stat)
+     */
+    public void processResult(int rc, String path, Object ctx, byte[] data, Stat stat) {
+
+        if (rc == KeeperException.Code.NONODE.intValue()) {
+            if (LOG.isDebugEnabled()) {
+                LOG.debug("No such ledger: " + ledgerId, KeeperException.create(KeeperException.Code.get(rc), path));
+            }
+            cb.openComplete(BKException.Code.NoSuchLedgerExistsException, null, this.ctx);
+            return;
+        }
+        if (rc != KeeperException.Code.OK.intValue()) {
+            LOG.error("Could not read metadata for ledger: " + ledgerId, KeeperException.create(KeeperException.Code
+                    .get(rc), path));
+            cb.openComplete(BKException.Code.ZKException, null, this.ctx);
+            return;
+        }
+
+        LedgerMetadata metadata;
+        try {
+            metadata = LedgerMetadata.parseConfig(data);
+        } catch (IOException e) {
+            LOG.error("Could not parse ledger metadata for ledger: " + ledgerId, e);
+            cb.openComplete(BKException.Code.ZKException, null, this.ctx);
+            return;
+        }
+
+        try {
+            lh = new LedgerHandle(bk, ledgerId, metadata, digestType, passwd);
+        } catch (GeneralSecurityException e) {
+            LOG.error("Security exception while opening ledger: " + ledgerId, e);
+            cb.openComplete(BKException.Code.DigestNotInitializedException, null, this.ctx);
+            return;
+        }
+
+        if (metadata.close != LedgerMetadata.NOTCLOSED) {
+            // Ledger was closed properly
+            cb.openComplete(BKException.Code.OK, lh, this.ctx);
+            return;
+        }
+
+        lh.recover(new GenericCallback<Void>() {
+            @Override
+            public void operationComplete(int rc, Void result) {
+                if (rc != BKException.Code.OK) {
+                    cb.openComplete(BKException.Code.LedgerRecoveryException, null, LedgerOpenOp.this.ctx);
+                } else {
+                    cb.openComplete(BKException.Code.OK, lh, LedgerOpenOp.this.ctx);
+                }
+            }
+        });
+    }
+}

+ 0 - 245
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerRecoveryMonitor.java

@@ -1,245 +0,0 @@
-package org.apache.bookkeeper.client;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.lang.Math;
-import java.lang.InterruptedException;
-import java.io.IOException;
-import java.net.InetSocketAddress;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.HashMap;
-import java.util.TreeMap;
-
-//import org.apache.bookkeeper.client.AsyncCallback.FailCallback;
-import org.apache.bookkeeper.client.BKException;
-import org.apache.bookkeeper.client.BookKeeper;
-import org.apache.bookkeeper.client.LedgerSequence;
-import org.apache.bookkeeper.client.LedgerHandle.QMode;
-import org.apache.bookkeeper.proto.BookieClient;
-import org.apache.bookkeeper.proto.ReadEntryCallback;
-import org.apache.log4j.Logger;
-
-import org.apache.zookeeper.KeeperException;
-
-/**
- * Implements the mechanism to recover a ledger that was not closed 
- * correctly. It reads entries from the ledger using the hint field
- * until it finds the last entry written. It then writes to ZooKeeper. 
- * 
- */
-
-class LedgerRecoveryMonitor implements ReadEntryCallback {
-    Logger LOG = Logger.getLogger(LedgerRecoveryMonitor.class);
-    
-    BookKeeper self;
-    long lId;
-    int qSize;
-    QMode qMode;
-    ArrayList<InetSocketAddress> bookies;
-    ArrayList<BookieClient> clients;
-    HashMap<Long, ArrayList<ByteBuffer> > votes;
-    TreeMap<Long, Integer > hints;
-    AtomicInteger counter;
-    
-    private int minimum;
-    
-    /**
-     * Constructor simply initiates data structures
-     * 
-     * @param self  Instance of BookKeeper
-     * @param lId   Ledger identifier
-     * @param qSize Quorum size
-     * @param bookies   List of bookie addresses
-     * @param qMode     Quorum mode
-     */
-    LedgerRecoveryMonitor(BookKeeper self,
-            long lId, 
-            int qSize, 
-            ArrayList<InetSocketAddress> bookies, 
-            QMode qMode){
-        this.self = self;
-        this.lId = lId;
-        this.qSize = qSize;
-        this.qMode = qMode;
-        this.bookies = bookies;
-        this.clients = new ArrayList<BookieClient>();
-        this.votes = new HashMap<Long, ArrayList<ByteBuffer> >();
-        this.hints = new TreeMap<Long, Integer>();
-        this.counter = new AtomicInteger(0);
-        
-        this.minimum = bookies.size();
-        if(qMode == QMode.VERIFIABLE){
-            this.minimum += 1 - qSize; 
-        } else if(qMode == QMode.GENERIC){
-            this.minimum -= Math.floor(qSize/2);
-        } 
-        
-    }
-    
-    
-    /**
-     * Determines the last entry written to a ledger not closed properly
-     * due to a client crash
-     * 
-     * @param   passwd  
-     */
-    boolean recover(byte[] passwd) throws 
-    IOException, InterruptedException, BKException, KeeperException {
-        /*
-         * Create BookieClient objects and send a request to each one.
-         */
-        
-        for(InetSocketAddress s : bookies){
-            LOG.info(s);
-            BookieClient client = new BookieClient(s, 3000);
-            clients.add(client);
-            client.readEntry(lId,
-                    -1,
-                    this,
-                    null);
-        }        
-        
-        /*
-         * Wait until I have received enough responses
-         */
-        synchronized(counter){
-            LOG.info("Counter: " + counter.get() + ", " + minimum + ", " + qMode);
-            if(counter.get() < minimum){
-                LOG.info("Waiting...");
-                counter.wait(5000);
-            }
-        }
-        
-        /*
-         * Obtain largest hint 
-         */ 
-        LedgerHandle lh = new LedgerHandle(self, lId, 0, qSize, qMode, passwd);
-        for(InetSocketAddress addr : bookies){
-            lh.addBookieForReading(addr);
-        }
-        
-        boolean notLegitimate = true;
-        long readCounter = 0;
-        while(notLegitimate){
-            readCounter = getNextHint();
-            if(readCounter > -1){
-                lh.setLast(readCounter);
-                boolean hasMore = true;
-                while(hasMore){
-                    hasMore = false;
-                    LOG.debug("Recovering: " + lh.getLast());
-                    LedgerSequence ls = lh.readEntries(lh.getLast(), lh.getLast());
-                    LOG.debug("Received entry for: " + lh.getLast());
-                    
-                    byte[] le = ls.nextElement().getEntry();
-                    if(le != null){
-                        if(notLegitimate) notLegitimate = false;
-                        lh.addEntry(le);
-                        hasMore = true;
-                    }
-                }
-            } else break;   
-        }
-        
-        /*
-         * Write counter as the last entry of ledger
-         */
-        if(!notLegitimate){
-            lh.setAddConfirmed(readCounter);
-            lh.close();
-            
-            return true;
-        } else {
-        	lh.setLast(0);
-        	lh.close();
-        	
-        	return false;
-        }
-                
-    }
-    
-    /**
-     * Read callback implementation
-     * 
-     * @param rc    return code
-     * @param ledgerId  Ledger identifier
-     * @param entryId   Entry identifier
-     * @param bb        Data
-     * @param ctx       Control object
-     * 
-     */
-    public void readEntryComplete(int rc, long ledgerId, long entryId, ByteBuffer bb, Object ctx){
-        if(rc == 0){
-            bb.rewind();
-        
-            /*
-             * Collect new vote
-             */
-            if(!votes.containsKey(entryId)){            
-                votes.put(entryId, new ArrayList<ByteBuffer>());
-            }
-            votes.get(entryId).add(bb);
-         
-            /*
-             * Extract hint
-             */
-        
-            bb.position(16);
-            long hint = bb.getLong();
-        
-            LOG.info("Received a response: " + rc + ", " + entryId + ", " + hint);
-        
-            if(!hints.containsKey(hint)){
-                hints.put(hint, 0);
-            }
-            hints.put(hint, hints.get(hint) + 1);
-        
-            synchronized(counter){
-                if(counter.incrementAndGet() >= minimum);
-                counter.notify();
-            }
-        } else {
-            LOG.debug("rc != 0");
-        }
-        
-    }
-    
-    /**
-     * Returns one hint at a time. We add a new hint to
-     * the "hints" TreeMap used in this method upon a read
-     * callback. Such callbacks correspond to returned values from bookies upon a request
-     * for the last entry written hint.
-     * 
-     * @return long next hint
-     */
-    private long getNextHint(){
-        if(hints.size() == 0) return -1;
-        
-        long hint = hints.lastKey();
-        hints.remove(hint);
-        
-        return hint;
-    }
-    
-}

+ 167 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerRecoveryOp.java

@@ -0,0 +1,167 @@
+package org.apache.bookkeeper.client;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Enumeration;
+
+import org.apache.bookkeeper.client.AsyncCallback.AddCallback;
+import org.apache.bookkeeper.client.AsyncCallback.ReadCallback;
+import org.apache.bookkeeper.client.BKException.BKDigestMatchException;
+import org.apache.bookkeeper.client.LedgerHandle.NoopCloseCallback;
+import org.apache.bookkeeper.client.DigestManager.RecoveryData;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.ReadEntryCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.GenericCallback;
+import org.apache.log4j.Logger;
+import org.jboss.netty.buffer.ChannelBuffer;
+
+/**
+ * This class encapsulates the ledger recovery operation. It first does a read
+ * with entry-id of -1 to all bookies. Then starting from the last confirmed
+ * entry (from hints in the ledger entries), it reads forward until it is not
+ * able to find a particular entry. It closes the ledger at that entry.
+ */
+class LedgerRecoveryOp implements ReadEntryCallback, ReadCallback, AddCallback {
+    static final Logger LOG = Logger.getLogger(LedgerRecoveryOp.class);
+    /** Handle of the ledger being recovered. */
+    LedgerHandle lh;
+    /** Number of responses to the initial entry -1 read that are still outstanding. */
+    int numResponsesPending;
+    /** Set once forward reading has started; later responses to the initial read are ignored. */
+    boolean proceedingWithRecovery = false;
+    /** Largest entry id seen in the verified responses to the initial read. */
+    long maxAddPushed = -1;
+    /** Largest last-add-confirmed value seen in the verified responses to the initial read. */
+    long maxAddConfirmed = -1;
+
+    /** Callback notified with the outcome of the recovery. */
+    GenericCallback<Void> cb;
+
+    /**
+     * @param lh
+     *            handle of the ledger to recover
+     * @param cb
+     *            callback notified with the outcome of the recovery
+     */
+    public LedgerRecoveryOp(LedgerHandle lh, GenericCallback<Void> cb) {
+        this.cb = cb;
+        this.lh = lh;
+        // NOTE(review): initiate() sends one read per member of
+        // currentEnsemble; this assumes ensembleSize == currentEnsemble.size()
+        // - confirm.
+        numResponsesPending = lh.metadata.ensembleSize;
+    }
+
+    /**
+     * Starts recovery by sending a read for entry id -1 to every bookie of the
+     * current ensemble. The index of the bookie in the ensemble is passed as
+     * the callback context.
+     */
+    public void initiate() {
+        for (int i = 0; i < lh.metadata.currentEnsemble.size(); i++) {
+            lh.bk.bookieClient.readEntry(lh.metadata.currentEnsemble.get(i), lh.ledgerId, -1, this, i);
+        }
+    }
+
+    /**
+     * Handles the response of one bookie to the initial read of entry -1.
+     *
+     * A response counts as valid when it is OK and its digest verifies, or
+     * when the bookie reports that the ledger or the entry does not exist. As
+     * soon as a valid response arrives for which the distribution schedule
+     * says recovery can proceed, the handle's last-add-pushed and
+     * last-add-confirmed are set to the largest confirmed id seen and forward
+     * reading begins. If every response has arrived without reaching that
+     * point, the recovery callback is failed with LedgerRecoveryException.
+     *
+     * @param rc
+     *            return code of the read
+     * @param ledgerId
+     *            ledger the response belongs to
+     * @param entryId
+     *            entry id of the response
+     * @param buffer
+     *            response payload, digest-verified here when rc is OK
+     * @param ctx
+     *            Integer index of the responding bookie in the ensemble
+     */
+    public synchronized void readEntryComplete(final int rc, final long ledgerId, final long entryId,
+            final ChannelBuffer buffer, final Object ctx) {
+
+        // Already proceeding with recovery, nothing to do
+        if (proceedingWithRecovery) {
+            return;
+        }
+
+        int bookieIndex = (Integer) ctx;
+
+        numResponsesPending--;
+
+        boolean heardValidResponse = false;
+
+        if (rc == BKException.Code.OK) {
+            try {
+                RecoveryData recoveryData = lh.macManager.verifyDigestAndReturnLastConfirmed(buffer);
+                maxAddConfirmed = Math.max(maxAddConfirmed, recoveryData.lastAddConfirmed);
+                maxAddPushed = Math.max(maxAddPushed, recoveryData.entryId);
+                heardValidResponse = true;
+            } catch (BKDigestMatchException e) {
+                // Too bad, this bookie didn't give us a valid answer, we still
+                // might be able to recover though so continue
+                LOG.error("Mac mismatch while reading last entry from bookie: "
+                        + lh.metadata.currentEnsemble.get(bookieIndex));
+            }
+        }
+
+        if (rc == BKException.Code.NoSuchLedgerExistsException || rc == BKException.Code.NoSuchEntryException) {
+            // this still counts as a valid response, e.g., if the client
+            // crashed without writing any entry
+            heardValidResponse = true;
+        }
+
+        // other return codes don't count as valid responses
+        if (heardValidResponse && lh.distributionSchedule.canProceedWithRecovery(bookieIndex)) {
+            proceedingWithRecovery = true;
+            lh.lastAddPushed = lh.lastAddConfirmed = maxAddConfirmed;
+            doRecoveryRead();
+            return;
+        }
+
+        if (numResponsesPending == 0) {
+            // Have got all responses back but was still not enough to start
+            // recovery, just fail the operation
+            LOG.error("While recovering ledger: " + ledgerId + " did not hear success responses from all quorums");
+            cb.operationComplete(BKException.Code.LedgerRecoveryException, null);
+        }
+
+    }
+
+    /**
+     * Try to read past the last confirmed.
+     *
+     * The handle's last-add-confirmed is temporarily advanced by one before
+     * the read is issued; {@link #readComplete} undoes the increment.
+     */
+    private void doRecoveryRead() {
+        lh.lastAddConfirmed++;
+        lh.asyncReadEntries(lh.lastAddConfirmed, lh.lastAddConfirmed, this, null);
+
+    }
+
+    /**
+     * Handles the result of a forward recovery read. On success the entry is
+     * added again through the handle; when the entry (or ledger) does not
+     * exist the ledger is closed and recovery is reported as successful; any
+     * other return code fails the recovery with that code.
+     */
+    @Override
+    public void readComplete(int rc, LedgerHandle lh, Enumeration<LedgerEntry> seq, Object ctx) {
+        // get back to prev value
+        lh.lastAddConfirmed--;
+        if (rc == BKException.Code.OK) {
+            lh.asyncAddEntry(seq.nextElement().getEntry(), this, null);
+            return;
+        }
+
+        if (rc == BKException.Code.NoSuchEntryException || rc == BKException.Code.NoSuchLedgerExistsException) {
+            lh.asyncClose(NoopCloseCallback.instance, null);
+            // we don't need to wait for the close to complete. Since we mark
+            // the ledger closed in memory, the application won't be able to
+            // add to it
+
+            cb.operationComplete(BKException.Code.OK, null);
+            return;
+        }
+
+        // otherwise, some other error, we can't handle.
+        // The sum is parenthesized so the entry id is computed arithmetically
+        // instead of having the character '1' appended to the string.
+        LOG.error("Failure " + BKException.getMessage(rc) + " while reading entry: " + (lh.lastAddConfirmed + 1)
+                + " ledger: " + lh.ledgerId + " while recovering ledger");
+        cb.operationComplete(rc, null);
+        return;
+    }
+
+    /**
+     * Handles the result of re-adding a recovered entry: a failure aborts the
+     * recovery with the given return code, a success triggers the read of the
+     * next entry.
+     */
+    @Override
+    public void addComplete(int rc, LedgerHandle lh, long entryId, Object ctx) {
+        if (rc != BKException.Code.OK) {
+            // Give up, we can't recover from this error
+
+            // Parenthesized for the same reason as in readComplete.
+            LOG.error("Failure " + BKException.getMessage(rc) + " while writing entry: " + (lh.lastAddConfirmed + 1)
+                    + " ledger: " + lh.ledgerId + " while recovering ledger");
+            cb.operationComplete(rc, null);
+            return;
+        }
+
+        doRecoveryRead();
+
+    }
+
+}

+ 0 - 65
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/LedgerSequence.java

@@ -1,65 +0,0 @@
-package org.apache.bookkeeper.client;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.util.Enumeration;
-import java.util.List;
-import java.util.Arrays;
-import java.util.NoSuchElementException;
-
-import org.apache.log4j.Logger;
-
-/**
- * Sequence of entries of a ledger. Used to return a sequence of entries
- * upon an asynchornous read call.
- *
- */
-
-
-public class LedgerSequence 
-implements Enumeration<LedgerEntry> {
-    Logger LOG = Logger.getLogger(LedgerSequence.class);
-    
-    int index = 0;
-    List<LedgerEntry> seq;
-    
-    LedgerSequence(LedgerEntry[] seq){
-        this.seq = Arrays.asList(seq);
-        LOG.debug("Sequence provided: " + this.seq.size());
-    }
-    
-    public boolean hasMoreElements(){
-        if(index < seq.size())
-            return true;
-        else
-            return false;
-    }
-    
-    public LedgerEntry nextElement() throws NoSuchElementException{
-        LOG.debug("Next element of sequence: " + seq.size() + ", " + index);
-        return seq.get(index++);
-    }
-    
-    public int size(){
-        return seq.size();
-    }
-}

+ 67 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/MacDigestManager.java

@@ -0,0 +1,67 @@
+package org.apache.bookkeeper.client;
+
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+import java.security.GeneralSecurityException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+import javax.crypto.Mac;
+import javax.crypto.spec.SecretKeySpec;
+
+/**
+ * Digest manager that computes an HmacSHA1 code, keyed with a SHA-1 digest
+ * derived from the password.
+ */
+class MacDigestManager extends DigestManager {
+    /** Message digest algorithm used by {@link #genDigest}. */
+    public static final String DIGEST_ALGORITHM = "SHA-1";
+    /** Algorithm name used for both the secret key spec and the Mac instance. */
+    public static final String KEY_ALGORITHM = "HmacSHA1";
+    Mac mac;
+
+    /**
+     * Creates the Mac instance and initializes it with a key derived from the
+     * password.
+     *
+     * @param ledgerId
+     *            ledger identifier, passed to the superclass
+     * @param passwd
+     *            password the mac key is derived from
+     * @throws GeneralSecurityException
+     *             if an algorithm is unavailable or the key is rejected
+     */
+    public MacDigestManager(long ledgerId, byte[] passwd) throws GeneralSecurityException {
+        super(ledgerId);
+        byte[] macKey = genDigest("mac", passwd);
+        SecretKeySpec keySpec = new SecretKeySpec(macKey, KEY_ALGORITHM);
+        mac = Mac.getInstance(KEY_ALGORITHM);
+        mac.init(keySpec);
+    }
+
+    /**
+     * Computes the {@link #DIGEST_ALGORITHM} digest of the pad followed by the
+     * password.
+     *
+     * @param pad
+     *            string hashed in front of the password
+     * @param passwd
+     *            password bytes
+     * @return the resulting digest
+     * @throws NoSuchAlgorithmException
+     *             if the digest algorithm is not available
+     */
+    static byte[] genDigest(String pad, byte[] passwd) throws NoSuchAlgorithmException {
+        MessageDigest digest = MessageDigest.getInstance(DIGEST_ALGORITHM);
+        // Encode the pad with a fixed charset: the platform default charset
+        // differs between machines, which would make the derived key
+        // platform-dependent.
+        digest.update(pad.getBytes(java.nio.charset.Charset.forName("UTF-8")));
+        digest.update(passwd);
+        return digest.digest();
+    }
+
+    /**
+     * @return 20, the length in bytes of an HmacSHA1 code
+     */
+    @Override
+    int getMacCodeLength() {
+        return 20;
+    }
+
+    /**
+     * Finishes the MAC computation over the data fed through {@link #update};
+     * {@code Mac.doFinal()} also resets the Mac for the next computation.
+     */
+    @Override
+    byte[] getValueAndReset() {
+        return mac.doFinal();
+    }
+
+    /** Feeds {@code length} bytes of {@code data}, starting at {@code offset}, into the MAC. */
+    @Override
+    void update(byte[] data, int offset, int length) {
+        mac.update(data, offset, length);
+    }
+
+}

+ 137 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/PendingAddOp.java

@@ -0,0 +1,137 @@
+package org.apache.bookkeeper.client;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.net.InetSocketAddress;
+import org.apache.bookkeeper.client.AsyncCallback.AddCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.WriteCallback;
+import org.apache.log4j.Logger;
+import org.jboss.netty.buffer.ChannelBuffer;
+
+/**
+ * A single add of one entry that is still in flight. Write requests are sent
+ * to the bookies holding the entry's replicas. Once every replica has
+ * acknowledged and this op is at the head of the ledger handle's pending adds
+ * queue, the handle is asked to send the success callbacks. If a write fails,
+ * the ledger handle is told about the bookie failure; the write can then be
+ * re-sent to whichever bookie occupies that position in the ensemble.
+ */
+class PendingAddOp implements WriteCallback {
+    static final Logger LOG = Logger.getLogger(PendingAddOp.class);
+
+    ChannelBuffer toSend;
+    AddCallback cb;
+    Object ctx;
+    long entryId;
+    boolean[] successesSoFar;
+    int numResponsesPending;
+    LedgerHandle lh;
+
+    /**
+     * @param lh
+     *            handle of the ledger the entry is added to
+     * @param cb
+     *            application callback invoked by {@link #submitCallback}
+     * @param ctx
+     *            application context handed back to the callback
+     * @param entryId
+     *            id of the entry being added
+     */
+    PendingAddOp(LedgerHandle lh, AddCallback cb, Object ctx, long entryId) {
+        this.lh = lh;
+        this.cb = cb;
+        this.ctx = ctx;
+        this.entryId = entryId;
+        // one success flag, and one expected response, per replica
+        this.successesSoFar = new boolean[lh.metadata.quorumSize];
+        this.numResponsesPending = this.successesSoFar.length;
+    }
+
+    /**
+     * Sends the entry to the bookie at the given position of the current
+     * ensemble; the replica (array) index travels as the callback context.
+     */
+    void sendWriteRequest(int bookieIndex, int arrayIndex) {
+        lh.bk.bookieClient.addEntry(
+                lh.metadata.currentEnsemble.get(bookieIndex),
+                lh.ledgerId,
+                lh.ledgerKey,
+                entryId,
+                toSend,
+                this,
+                arrayIndex);
+    }
+
+    /**
+     * Re-sends this entry to the bookie at the given ensemble position,
+     * forgetting any success previously recorded for the matching replica.
+     * Does nothing if the payload is not ready yet or if the distribution
+     * schedule returns a negative replica index for that position.
+     */
+    void unsetSuccessAndSendWriteRequest(int bookieIndex) {
+        // Payload (with its mac) not built yet: initiate() will send the
+        // write requests once it is, so there is nothing to re-send now.
+        if (toSend == null) {
+            return;
+        }
+
+        final int replica = lh.distributionSchedule.getReplicaIndex(entryId, bookieIndex);
+        if (replica < 0) {
+            debugForBookie("Leaving unchanged, ledger: ", bookieIndex);
+            return;
+        }
+        debugForBookie("Unsetting success for ledger: ", bookieIndex);
+
+        // A success already heard for this replica no longer counts, so one
+        // more response becomes pending again.
+        if (successesSoFar[replica]) {
+            successesSoFar[replica] = false;
+            numResponsesPending++;
+        }
+
+        sendWriteRequest(bookieIndex, replica);
+    }
+
+    /** Emits a debug line identifying ledger, entry and bookie index, if debug logging is on. */
+    private void debugForBookie(String prefix, int bookieIndex) {
+        if (LOG.isDebugEnabled()) {
+            LOG.debug(prefix + lh.ledgerId + " entry: " + entryId + " bookie index: "
+                    + bookieIndex);
+        }
+    }
+
+    /**
+     * Records the payload and sends one write request per replica.
+     *
+     * @param toSend
+     *            the buffer to write to the bookies
+     */
+    void initiate(ChannelBuffer toSend) {
+        this.toSend = toSend;
+        final int replicas = successesSoFar.length;
+        for (int replica = 0; replica < replicas; replica++) {
+            sendWriteRequest(lh.distributionSchedule.getBookieIndex(entryId, replica), replica);
+        }
+    }
+
+    /**
+     * Handles the response to one write request. Responses from a bookie that
+     * is no longer at its ensemble position are ignored; failures are handed
+     * to the ledger handle; the first success for a replica is recorded.
+     */
+    @Override
+    public void writeComplete(int rc, long ledgerId, long entryId, InetSocketAddress addr, Object ctx) {
+        final Integer replica = (Integer) ctx;
+        final int bookie = lh.distributionSchedule.getBookieIndex(entryId, replica);
+
+        final boolean sameBookie = lh.metadata.currentEnsemble.get(bookie).equals(addr);
+        if (!sameBookie) {
+            // ensemble has already changed, failure of this addr is immaterial
+            LOG.warn("Write did not succeed: " + ledgerId + ", " + entryId + ". But we have already fixed it.");
+            return;
+        }
+
+        if (rc != BKException.Code.OK) {
+            LOG.warn("Write did not succeed: " + ledgerId + ", " + entryId);
+            lh.handleBookieFailure(addr, bookie);
+            return;
+        }
+
+        // success already recorded for this replica: nothing to update
+        if (successesSoFar[replica]) {
+            return;
+        }
+        successesSoFar[replica] = true;
+        numResponsesPending--;
+
+        // Quick, unlocked check whether some adds may have finished; the
+        // handle re-checks under locks.
+        if (numResponsesPending == 0) {
+            if (lh.pendingAddOps.peek() == this) {
+                lh.sendAddSuccessCallbacks();
+            }
+        }
+    }
+
+    /** Invokes the application callback with the given return code. */
+    void submitCallback(final int rc) {
+        cb.addComplete(rc, lh, entryId, ctx);
+    }
+
+}

+ 145 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/PendingReadOp.java

@@ -0,0 +1,145 @@
+package org.apache.bookkeeper.client;
+
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+import java.net.InetSocketAddress;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.NoSuchElementException;
+import java.util.Queue;
+import org.apache.bookkeeper.client.AsyncCallback.ReadCallback;
+import org.apache.bookkeeper.client.BKException.BKDigestMatchException;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.ReadEntryCallback;
+import org.apache.log4j.Logger;
+import org.jboss.netty.buffer.ChannelBuffer;
+import org.jboss.netty.buffer.ChannelBufferInputStream;
+
+/**
+ * Sequence of entries of a ledger that represents a pending read operation.
+ * When all the data read has come back, the application callback is called.
+ * This class could be improved because we could start pushing data to the
+ * application as soon as it arrives rather than waiting for the whole thing.
+ */
+class PendingReadOp implements Enumeration<LedgerEntry>, ReadEntryCallback {
+    static final Logger LOG = Logger.getLogger(PendingReadOp.class);
+
+    /** Entries of the requested range, in ascending entry-id order. */
+    Queue<LedgerEntry> seq;
+    ReadCallback cb;
+    Object ctx;
+    LedgerHandle lh;
+    /** Number of entries whose data has not yet been received and verified. */
+    long numPendingReads;
+    long startEntryId;
+    long endEntryId;
+
+    /**
+     * @param lh
+     *            handle of the ledger to read from
+     * @param startEntryId
+     *            first entry to read (inclusive)
+     * @param endEntryId
+     *            last entry to read (inclusive)
+     * @param cb
+     *            application callback notified when the read completes or fails
+     * @param ctx
+     *            application context handed back to the callback
+     */
+    PendingReadOp(LedgerHandle lh, long startEntryId, long endEntryId, ReadCallback cb, Object ctx) {
+
+        // the range is inclusive on both ends, hence the + 1
+        seq = new ArrayDeque<LedgerEntry>((int) (endEntryId - startEntryId + 1));
+        this.cb = cb;
+        this.ctx = ctx;
+        this.lh = lh;
+        this.startEntryId = startEntryId;
+        this.endEntryId = endEntryId;
+        numPendingReads = endEntryId - startEntryId + 1;
+    }
+
+    /**
+     * Queues one {@link LedgerEntry} per entry id of the range and sends the
+     * first read for each. The ensemble is looked up again only at the entry
+     * ids where the metadata reports an ensemble change.
+     */
+    public void initiate() {
+        long nextEnsembleChange = startEntryId, i = startEntryId;
+
+        ArrayList<InetSocketAddress> ensemble = null;
+        do {
+
+            if (i == nextEnsembleChange) {
+                ensemble = lh.metadata.getEnsemble(i);
+                nextEnsembleChange = lh.metadata.getNextEnsembleChange(i);
+            }
+            LedgerEntry entry = new LedgerEntry(lh.ledgerId, i);
+            seq.add(entry);
+            i++;
+            sendRead(ensemble, entry, BKException.Code.ReadException);
+
+        } while (i <= endEntryId);
+
+    }
+
+    /**
+     * Sends a read for the entry to the next replica that has not been tried
+     * yet. If every replica has already been tried, the application callback
+     * is failed with the last error code.
+     *
+     * @param ensemble
+     *            ensemble holding the entry
+     * @param entry
+     *            entry to read; tracks the next replica index to try
+     * @param lastErrorCode
+     *            code reported to the application if no replica is left
+     */
+    void sendRead(ArrayList<InetSocketAddress> ensemble, LedgerEntry entry, int lastErrorCode) {
+        if (entry.nextReplicaIndexToReadFrom >= lh.metadata.quorumSize) {
+            // we are done, the read has failed from all replicas, just fail the
+            // read
+            cb.readComplete(lastErrorCode, lh, null, ctx);
+            return;
+        }
+
+        int bookieIndex = lh.distributionSchedule.getBookieIndex(entry.entryId, entry.nextReplicaIndexToReadFrom);
+        entry.nextReplicaIndexToReadFrom++;
+        lh.bk.bookieClient.readEntry(ensemble.get(bookieIndex), lh.ledgerId, entry.entryId, this, entry);
+    }
+
+    /**
+     * Logs a failed read together with the bookie it was sent to, then
+     * retries the entry on the next replica.
+     */
+    void logErrorAndReattemptRead(LedgerEntry entry, String errMsg, int rc) {
+        ArrayList<InetSocketAddress> ensemble = lh.metadata.getEnsemble(entry.entryId);
+        // nextReplicaIndexToReadFrom was already advanced by sendRead, so the
+        // replica that just failed is the previous one
+        int bookieIndex = lh.distributionSchedule.getBookieIndex(entry.entryId, entry.nextReplicaIndexToReadFrom - 1);
+        LOG.error(errMsg + " while reading entry: " + entry.entryId + " ledgerId: " + lh.ledgerId + " from bookie: "
+                + ensemble.get(bookieIndex));
+        sendRead(ensemble, entry, rc);
+    }
+
+    /**
+     * Handles the response to one entry read. A failed read or a digest
+     * mismatch triggers a retry on the next replica; a verified response
+     * stores the data in the entry and, once no entry is pending any more,
+     * completes the operation towards the application.
+     */
+    @Override
+    public void readEntryComplete(int rc, long ledgerId, final long entryId, final ChannelBuffer buffer, Object ctx) {
+        final LedgerEntry entry = (LedgerEntry) ctx;
+
+        if (rc != BKException.Code.OK) {
+            logErrorAndReattemptRead(entry, "Error: " + BKException.getMessage(rc), rc);
+            return;
+        }
+
+        ChannelBufferInputStream is;
+        try {
+            is = lh.macManager.verifyDigestAndReturnData(entryId, buffer);
+        } catch (BKDigestMatchException e) {
+            logErrorAndReattemptRead(entry, "Mac mismatch", BKException.Code.DigestMatchException);
+            return;
+        }
+
+        entry.entryDataStream = is;
+
+        // Count the entry as done only after its digest has verified:
+        // counting before verification would count a retried entry twice and
+        // could complete the operation while another entry is still pending.
+        numPendingReads--;
+
+        if (numPendingReads == 0) {
+            cb.readComplete(BKException.Code.OK, lh, PendingReadOp.this, PendingReadOp.this.ctx);
+        }
+
+    }
+
+    /** @return true while entries remain to be consumed */
+    public boolean hasMoreElements() {
+        return !seq.isEmpty();
+    }
+
+    /**
+     * Removes and returns the next entry of the sequence.
+     *
+     * @throws NoSuchElementException
+     *             if no entry is left
+     */
+    public LedgerEntry nextElement() throws NoSuchElementException {
+        return seq.remove();
+    }
+
+    /** @return number of entries not yet consumed */
+    public int size() {
+        return seq.size();
+    }
+}

+ 0 - 299
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/QuorumEngine.java

@@ -1,299 +0,0 @@
-package org.apache.bookkeeper.client;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.bookkeeper.client.BKException;
-import org.apache.bookkeeper.client.BKException.Code;
-import org.apache.bookkeeper.client.ClientCBWorker;
-import org.apache.bookkeeper.client.QuorumOpMonitor;
-import org.apache.bookkeeper.client.AsyncCallback.AddCallback;
-import org.apache.bookkeeper.client.AsyncCallback.ReadCallback;
-import org.apache.bookkeeper.client.QuorumOpMonitor.PendingOp;
-import org.apache.bookkeeper.client.QuorumOpMonitor.PendingReadOp;
-import org.apache.bookkeeper.proto.ReadEntryCallback;
-import org.apache.bookkeeper.proto.WriteCallback;
-import org.apache.log4j.Logger;
-
-
-
-/**
- * Implements the quorum protocol.It basically handles requests coming 
- * from BookKeeper and forward to the appropriate BookieHandle objects.
- */
-
-public class QuorumEngine {
-    static Logger LOG = Logger.getLogger(QuorumEngine.class);
-
-    QuorumOpMonitor opMonitor;
-    ClientCBWorker cbWorker;
-    LedgerHandle lh;
-    int qRef = 0;
-    
-    /**
-     * Operation descriptor: Requests generated by BookKeeper.java
-     * upon client calls. There are three types of requests: READ, 
-     * ADD, STOP.
-     */
-    
-    static long idCounter; 
-    static synchronized long getOpId(){
-        return idCounter++;
-    }
-    
-    public static class Operation {
-        public static final int READ = 0;
-        public static final int ADD = 1;
-        public static final int STOP = 2;
-        
-        
-        int type;
-        LedgerHandle ledger;
-        long id;
-        int rc = 0;
-        boolean ready = false;
-        
-         public Operation(){
-             this.id = getOpId();
-         }
-            
-         long getId(){
-             return id;
-         }
-            
-
-        public static class AddOp extends Operation {
-            AddCallback cb;
-            Object ctx;
-            byte[] data;
-            long entry;
-            
-            public AddOp(LedgerHandle ledger, byte[] data, AddCallback cb, Object ctx){
-                type = Operation.ADD;
-            
-                this.data = data;
-                this.entry = ledger.incLast(); 
-                this.cb = cb;
-                this.ctx = ctx;
-                
-                this.ledger = ledger;
-            }
-            
-        }
-        
-        
-        public static class ReadOp extends Operation {
-            ReadCallback cb;
-            Object ctx;
-            long firstEntry;
-            long lastEntry;
-            LedgerEntry[] seq;
-            AtomicInteger counter;
-            HashMap<Long, AtomicInteger> nacks;
-            //boolean complete;
-            
-            public ReadOp(LedgerHandle ledger, long firstEntry, long lastEntry, ReadCallback cb, Object ctx){
-                type = READ;
-            
-                this.firstEntry = firstEntry;
-                this.lastEntry = lastEntry;
-                this.cb = cb;
-                this.ctx = ctx;
-                this.seq = new LedgerEntry[(int) (lastEntry - firstEntry + 1)];
-                this.counter = new AtomicInteger(0);
-                this.nacks = new HashMap<Long, AtomicInteger>();
-                //this.complete = false;
-                
-                this.ledger = ledger;
-            }
-        }
-        
-        public static class StopOp extends Operation {
-            public StopOp(){
-                type = STOP;
-            }
-        }
-        
-        
-        
-        
-        void setErrorCode(int rc){
-            this.rc = rc;
-        }
-        
-        int getErrorCode(){
-            return this.rc;
-        }
-        
-        synchronized boolean isReady(){
-                return ready;
-        }
-        
-        synchronized void setReady(){    
-              ready = true;
-              this.notify();
-        }
-        
-        LedgerHandle getLedger(){
-            return ledger;
-        }
-    }
-    
-    
-    public static class SubOp{
-     int bIndex;   
-     Operation op;
-     
-     public static class SubAddOp extends SubOp{
-         PendingOp pOp;
-         WriteCallback wcb;
-        
-         SubAddOp(Operation op, 
-                 PendingOp pOp, 
-                 int bIndex,
-                 WriteCallback wcb){
-             this.op = op;
-             this.pOp = pOp;
-             this.bIndex = bIndex;
-             this.wcb = wcb;
-         }
-     }
-    
-     public static class SubReadOp extends SubOp{
-         PendingReadOp pOp;
-         ReadEntryCallback rcb;
-         
-         SubReadOp(Operation op, 
-                 PendingReadOp pOp, 
-                 int bIndex, 
-                 ReadEntryCallback rcb){
-             this.op = op;
-             this.pOp = pOp;
-             this.bIndex = bIndex;
-             this.rcb = rcb;
-         }
-     }
-     
-     public static class SubStopOp extends SubOp{
-         SubStopOp(Operation op){
-             this.op = op;
-         }
-     }
-    }
-    
-    public QuorumEngine(LedgerHandle lh){ 
-        this.lh = lh;
-        this.opMonitor = new QuorumOpMonitor(lh);
-        QuorumEngine.idCounter = 0;
-        LOG.debug("Creating cbWorker");
-        this.cbWorker = ClientCBWorker.getInstance();
-        LOG.debug("Created cbWorker");
-    }
-  
-    /**
-     * Sends requests to BookieHandle instances. Methods in BookKeeper call
-     * this method to submit both add and read requests.
-     * 
-     * @param r Operation descriptor
-     */
-    void sendOp(Operation r)
-    throws InterruptedException, BKException {
-        int n;    
-        
-        switch(r.type){
-        case Operation.READ:
-            
-            Operation.ReadOp rOp = (Operation.ReadOp) r;
-            
-            LOG.debug("Adding read operation to opMonitor: " + rOp.firstEntry + ", " + rOp.lastEntry);
-            cbWorker.addOperation(r);
-            
-            for(long entry = rOp.firstEntry; 
-            entry <= rOp.lastEntry;
-            entry++){
-                long counter = 0;
-                PendingReadOp pROp = new PendingReadOp(lh);
-                
-                n = lh.getBookies(entry).size();
-                if(n < lh.getQuorumSize())
-                    throw BKException.create(Code.NotEnoughBookiesException);
-                
-                //Send requests to bookies
-                while(counter < lh.getQuorumSize()){
-                    int index = (int)((entry + counter++) % n);
-                    try{
-                        SubOp.SubReadOp sRead = new SubOp.SubReadOp(rOp, 
-                                pROp, 
-                                index,
-                                opMonitor);
-   
-                        BookieHandle bh = lh.getBookies(entry).get((index) % n); 
-                        if(bh.isEnabled()) bh.sendRead(lh, sRead, entry);            
-                    } catch(IOException e){
-                        LOG.error(e);
-                    }
-                }  
-            }
-  
-            break;
-        case Operation.ADD:
-            n = lh.getBookies().size();
-
-            if(n < lh.getQuorumSize())
-                throw BKException.create(Code.NotEnoughBookiesException);
-            
-            long counter = 0;
-            
-            cbWorker.addOperation(r);
-            Operation.AddOp aOp = (Operation.AddOp) r;
-            PendingOp pOp = new PendingOp();
-            ArrayList<BookieHandle> bookies;
-            
-            while(counter < lh.getQuorumSize()  ){
-                int index = (int)((aOp.entry + counter++) % n);
-                
-                try{
-                    SubOp.SubAddOp sAdd = new 
-                    SubOp.SubAddOp(aOp, 
-                            pOp, 
-                            index,
-                            opMonitor);
-                   
-                    lh.getBookies().get((index) % n).sendAdd(lh, sAdd, aOp.entry);
-                } catch (Exception io) {
-                    LOG.error("Error when sending entry: " + aOp.entry + ", " + index + ", " + io);
-                    counter--;
-                    n = lh.getBookies().size();
-                }
-            }
-            break;
-                case Operation.STOP:
-                    cbWorker.shutdown();
-                    break;
-        }
-    }
-    
-}

+ 0 - 472
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/QuorumOpMonitor.java

@@ -1,472 +0,0 @@
-package org.apache.bookkeeper.client;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Arrays;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
-import java.security.InvalidKeyException;
-import javax.crypto.Mac; 
-
-
-import org.apache.bookkeeper.client.BookieHandle;
-import org.apache.bookkeeper.client.BKException;
-import org.apache.bookkeeper.client.BKException.Code;
-import org.apache.bookkeeper.client.LedgerHandle;
-import org.apache.bookkeeper.client.QuorumEngine.Operation.AddOp;
-import org.apache.bookkeeper.client.QuorumEngine.Operation.ReadOp;
-import org.apache.bookkeeper.client.QuorumEngine.SubOp.SubAddOp;
-import org.apache.bookkeeper.client.QuorumEngine.SubOp.SubReadOp;
-import org.apache.bookkeeper.proto.BookieClient;
-import org.apache.bookkeeper.proto.ReadEntryCallback;
-import org.apache.bookkeeper.proto.WriteCallback;
-import org.apache.log4j.Logger;
-
-
-/**
- * Monitors reponses from bookies to requests of a client. It implements 
- * two interfaces of the proto package that correspond to callbacks from
- * BookieClient objects.
- * 
- */
-public class QuorumOpMonitor implements WriteCallback, ReadEntryCallback {
-    static Logger LOG = Logger.getLogger(QuorumOpMonitor.class);
-    
-    LedgerHandle lh;
-    
-    static final int MAXRETRIES = 2;
-    
-    
-    /**
-     * Message disgest instance
-     * 
-     */
-    MessageDigest digest = null;
-    int dLength;
-    
-    /** 
-     * Get digest instance if there is none.
-     * 
-     */
-    MessageDigest getDigestInstance(String alg)
-    throws NoSuchAlgorithmException {
-        if(digest == null){
-            digest = MessageDigest.getInstance(alg);
-        }
-        
-        return digest;
-    }
-    
-    public static class PendingOp{
-        //Operation op = null;
-        HashSet<Integer> bookieIdSent;
-        HashSet<Integer> bookieIdRecv;
-        int retries = 0;
-      
-        PendingOp(){
-            this.bookieIdSent = new HashSet<Integer>();
-            this.bookieIdRecv = new HashSet<Integer>();
-        }
-        
-    };
-    
-    
-    /**
-     * Objects of this type are used to keep track of the status of
-     * a given read request.
-     * 
-     */
-    
-    public static class PendingReadOp extends PendingOp{
-        /*
-         * Values for ongoing reads
-         */
-
-        ArrayList<ByteBuffer> proposedValues;
-                
-        PendingReadOp(LedgerHandle lh){
-            this.proposedValues =
-                new ArrayList<ByteBuffer>();
-        }    
-      
-    }
-    
-    QuorumOpMonitor(LedgerHandle lh){
-        this.lh = lh;
-        try{
-            this.dLength = getDigestInstance(lh.getDigestAlg()).getDigestLength();
-        } catch(NoSuchAlgorithmException e){
-            LOG.error("Problem with message digest: " + e);
-            this.dLength = 0;
-        }
-    }
-    
-   
-    /**
-     * Callback method for write operations. There is one callback for
-     * each write to a server.
-     * 
-     */
-    
-    public void writeComplete(int rc, long ledgerId, long entryId, Object ctx){ 
-        //PendingAddOp pOp;
-        //synchronized(pendingAdds){
-        //pOp = pendingAdds.get(entryId);
-        //}
-        SubAddOp sAdd = (SubAddOp) ctx;
-        PendingOp pOp = sAdd.pOp;
-        Integer sId = sAdd.bIndex;
-        
-        if(pOp == null){
-            LOG.error("No such an entry ID: " + entryId + "(" + ledgerId + ")");
-            return;
-        }
-        
-        ArrayList<BookieHandle> list = lh.getBookies();
-        if(rc == 0){
-            // Everything went ok with this op
-            synchronized(pOp){ 
-                pOp.bookieIdRecv.add(sId);
-                lh.setLastRecvCorrectly(sId, entryId);
-                if(pOp.bookieIdRecv.size() >= lh.getQuorumSize()){
-                    sAdd.op.setReady();     
-                }
-            }
-        } else {
-            //LOG.warn("Error sending write request: " + rc + " : " + ledgerId + ": " + lh.getBookies().size());
-            /*
-             * If ledger is closed already, then simply return
-             */
-            if(lh.getId() == -1){
-                LOG.warn("Ledger identifier is not valid");
-                return;
-            }
-            
-            HashSet<Integer> ids;
-              
-            synchronized(pOp){
-                pOp.bookieIdSent.add(sId);
-                ids = pOp.bookieIdSent;                
-                //Check if we tried all possible bookies already
-                if(ids.size() == lh.getBookies().size()){
-                    if(pOp.retries++ >= MAXRETRIES){
-                        //Call back with error code
-  
-                        sAdd.op.setErrorCode(BKDefs.ENR);
-                        sAdd.op.setReady();
-                        return;
-                    }
-                    
-                    ids.clear();
-                }
-                // Select another bookie that we haven't contacted yet
-                try{
-                    //LOG.info("Selecting another bookie " + entryId);
-                    int bCounter;
-                    if(sId >= (entryId % (lh.getBookies().size() + 1))){
-                        bCounter = sId - (((int) entryId) % (lh.getBookies().size() + 1));
-                    } else {
-                        bCounter = (lh.getBookies().size() + 1) - (((int) entryId) % (lh.getBookies().size() + 1)) - sId;
-                    }
-                    
-                    int tmpId = (((int) entryId) + lh.getQuorumSize()) % (lh.getBookies().size() + 1);
-                    int newId = tmpId % lh.getBookies().size();
-                    //LOG.info("Sending a new add operation to bookie: " + newId + ", " + lh.getBookies().get(newId).addr);
-                    
-                    BookieHandle bh = lh.getBookies().get(newId);
-                    
-                    //LOG.info("Got handle for " + newId);
-                    
-                    bh.sendAdd(lh, new SubAddOp(sAdd.op, 
-                            pOp, 
-                            newId, 
-                            this), entryId);
-               
-                    //LOG.info("Ended " + entryId + ", " + newId);
-                } catch(IOException e){
-                    LOG.error(e);
-                } catch(BKException e){
-                    LOG.error(e);
-                }
-            }
-        }       
-    }
-
-    
-    /**
-     * Callback method for read operations. There is one callback for
-     * each entry of a read request.
-     * 
-     * TODO: We might want to change the way a client application specify
-     * the quorum size. It is really loose now, and it allows an application
-     * to set any quorum size the client wants.
-     */
-    
-    public void readEntryComplete(int rc, long ledgerId, long entryId, ByteBuffer bb, Object ctx){
-        /*
-         * Collect responses, and reply when there are sufficient 
-         * answers.
-         */
-        if(rc == 0){
-            SubReadOp sRead = (SubReadOp) ctx;
-            ReadOp rOp = (ReadOp) sRead.op;
-            PendingReadOp pOp = sRead.pOp;
-            if(pOp != null){
-                HashSet<Integer> received = pOp.bookieIdRecv;
-                
-                boolean result = received.add(sRead.bIndex);
-                int counter = -1;
-                if(result){
-                    
-                    ByteBuffer voted = null;
-                    ArrayList<ByteBuffer> list;
-                    switch(lh.getQMode()){
-                    case VERIFIABLE:
-                        if(rOp.seq[(int) (entryId % (rOp.lastEntry - rOp.firstEntry + 1))] == null)
-                           try{
-                                voted = voteVerifiable(bb);
-                            } catch(NoSuchAlgorithmException e){
-                                LOG.error("Problem with message digest: " + e);
-                            } catch(BKException bke) {
-                                LOG.error(bke.toString() + "( " + ledgerId + ", " + entryId + ", " + pOp.bookieIdRecv + ")");
-                                countNacks((ReadOp) ((SubReadOp) ctx).op, (SubReadOp) ctx, ledgerId, entryId);
-                            } catch(InvalidKeyException e){
-                                LOG.error(e);
-                            }
- 
-                            if(voted != null) { 
-                                if(voted.capacity() - dLength > 0){
-                                    byte[] data = new byte[voted.capacity() - dLength - 24];
-                                    voted.position(24);                                    
-                                    voted.get(data, 0, data.length);
-                                    //LOG.warn("Data length (" + entryId + "): " + data.length);
-                                    counter = addNewEntry(new LedgerEntry(ledgerId, entryId, data), rOp);
-                                } 
-                            }
-                               
-                        break;
-                    case GENERIC:
-                        list = pOp.proposedValues;
-                        
-                        synchronized(list){
-                            if(rOp.seq[(int) (entryId % (rOp.lastEntry - rOp.firstEntry + 1))] == null){
-                                list.add(bb);
-                                bb.position(24);
-                                if(list.size() >= ((lh.getQuorumSize() + 1)/2)){
-                                    voted = voteGeneric(list, (lh.getQuorumSize() + 1)/2);
-                                }
-                            }
-                        }
-                        
-                                    
-                        if(voted != null){
-                            LOG.debug("Voted: " + voted.array());
-                            byte[] data = new byte[voted.capacity() - 24];
-                            voted.position(24);
-                            voted.get(data, 0, data.length);
-                            counter = addNewEntry(new LedgerEntry(ledgerId, entryId, data), rOp);
-                        }
-                                
-                                
-                        break;
-                    case FREEFORM:
-                        list = pOp.proposedValues;
-                        LOG.debug("List length before: " + list.size());
-                        synchronized(list){
-                            if(list.size() == lh.getQuorumSize()){
-                                voted = voteFree(list);
-                            }
-                        }
-                        
-                        if(voted != null){
-                            LOG.debug("Voted: " + voted.array());
-                            byte[] data = new byte[voted.capacity() - 24];
-                            voted.position(24);
-                            voted.get(data, 0, data.length);
-                            counter = addNewEntry(new LedgerEntry(ledgerId, entryId, voted.array()), rOp);
-                        }                      
-                    }   
-        
-                    if((counter == (rOp.lastEntry - rOp.firstEntry + 1)) && 
-                            !sRead.op.isReady()){
-                        sRead.op.setReady();
-                    }
-            
-                    
-                    //long diff = rOp.lastEntry - rOp.firstEntry;
-                    //LOG.debug("Counter: " + rOp.counter + ", " + diff);
-                }
-            }
-        } else {
-            /*
-             * Have to count the number of negative responses
-             */
-            countNacks((ReadOp) ((SubReadOp) ctx).op, (SubReadOp) ctx, ledgerId, entryId);
-            
-        }
-    }
-    
-    
-    /**
-     * Counts negative responses
-     * 
-     * @param   rOp read operation
-     * @param   sRead   specific read sub-operation
-     */
-    
-    synchronized void countNacks(ReadOp rOp, SubReadOp sRead, long ledgerId, long entryId){
-        
-        if(!rOp.nacks.containsKey(entryId)){
-            rOp.nacks.put(entryId, new AtomicInteger(0));
-        }
-        
-        if(rOp.nacks.get(entryId).incrementAndGet() >= lh.getThreshold()){
-            int counter = -1;
-            //LOG.warn("Giving up on " + entryId + "(" + lh.getThreshold() + ")");
-            counter = addNewEntry(new LedgerEntry(ledgerId, entryId, null), rOp);
-            
-            if((counter == (rOp.lastEntry - rOp.firstEntry + 1)) && 
-                    !sRead.op.isReady()){
-                
-                sRead.op.setReady();
-            }
-        }
-    }
-    
-    /**
-     * Verify if the set of votes in the list can produce a correct answer
-     * for verifiable data.
-     * 
-     * @param list
-     * @return
-     */
-    
-    
-    private ByteBuffer voteVerifiable(ByteBuffer bb) 
-    throws NoSuchAlgorithmException, InvalidKeyException, BKException{
-        /*
-         * Check if checksum matches
-         */
-        
-        Mac mac = ((BookieClient) Thread.currentThread()).getMac("HmacSHA1", lh.getMacKey());
-        int dlength = mac.getMacLength();
-       
-        if(bb.capacity() <= dlength){
-            LOG.warn("Something wrong with this entry, length smaller than digest length");
-            return null;
-        }
-        
-        byte[] data = new byte[bb.capacity() - dlength];
-        bb.get(data, 0, bb.capacity() - dlength);
-        
-        
-        byte[] sig = new byte[dlength];
-        bb.position(bb.capacity() - dlength);
-        bb.get(sig, 0, dlength);
-
-        bb.rewind();
-        
-        byte[] msgDigest = mac.doFinal(data);
-        if(Arrays.equals(msgDigest, sig)){
-            return bb;
-        } else {
-            LOG.error("Entry id: " + new String(msgDigest) + new String(sig));
-            throw BKException.create(Code.DigestMatchException);
-        }
-        
-    }
-    
-    /**
-     * Verify if the set of votes in the list can produce a correct answer
-     * for generic data.
-     * 
-     * @param list
-     * @return
-     */
-        
-    private ByteBuffer voteGeneric(ArrayList<ByteBuffer> list, int threshold){  
-        HashMap<ByteBuffer, Integer> map = new HashMap<ByteBuffer, Integer>();
-        for(ByteBuffer bb : list){  
-            if(!map.containsKey(bb)){
-                map.put(bb, new Integer(0));
-            } else LOG.debug("Not equal");
-            
-            if(bb != null)
-                map.put(bb, map.get(bb) + 1);
-            
-            if(map.get(bb) >= threshold)
-                return bb;  
-        }
-        
-        return null;   
-    }
-
-    /**
-     * Verify if the set of votes in the list can produce a correct answer
-     * for generic data.
-     * 
-     * @param list
-     * @return
-     */
-        
-    private ByteBuffer voteFree(ArrayList<ByteBuffer> list){  
-        HashMap<ByteBuffer, Integer> map = new HashMap<ByteBuffer, Integer>();
-        for(ByteBuffer bb : list){
-            bb.position(24);
-            if(!map.containsKey(bb)){
-                map.put(bb, Integer.valueOf(0));
-            }
-            map.put(bb, map.get(bb) + 1);
-            
-            if(map.get(bb) == list.size())
-                return bb;
-        }
-        
-        return null;   
-    }
-    
-    /**
-     * Add new entry to the list of received. 
-     * 
-     * @param le	ledger entry to add to list
-     * @param op	read operation metadata
-     */
-    
-    private int addNewEntry(LedgerEntry le, ReadOp op){
-        long index = le.getEntryId() % (op.lastEntry - op.firstEntry + 1);
-        if(op.seq[(int) index] == null){
-            if(le.getEntry() == null) LOG.warn("Ledger entry is null (" + le.getEntryId() + ")");
-            //if(le.getEntryId() % 100 == 0) LOG.info("New entry: " + le.getEntryId() + ")");
-            op.seq[(int) index] = le;
-            
-            return op.counter.incrementAndGet();
-        }
-        
-        return -1;
-    }
-}

+ 87 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/RoundRobinDistributionSchedule.java

@@ -0,0 +1,87 @@
+package org.apache.bookkeeper.client;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.bookkeeper.util.MathUtils;
+
+/**
+ * A {@link DistributionSchedule} that spreads entries over the ensemble in
+ * round-robin order: the replicas of an entry sit on consecutive bookies,
+ * starting at the bookie whose index is the entry id modulo the ensemble size.
+ * With ensemble size 3 and quorum size 2, entry 0 is stored on bookies 0 and 1,
+ * entry 1 on bookies 1 and 2, entry 2 on bookies 2 and 0, and so on.
+ */
+class RoundRobinDistributionSchedule implements DistributionSchedule {
+    int quorumSize;
+    int ensembleSize;
+
+    // Recovery bookkeeping, created lazily by canProceedWithRecovery():
+    // covered[s] records that the quorum starting at bookie index s has been
+    // covered by a recovery reply; numQuorumsUncovered counts the false slots.
+    boolean[] covered = null;
+    int numQuorumsUncovered;
+
+    /**
+     * @param quorumSize   number of replicas per entry
+     * @param ensembleSize number of bookies in the ensemble
+     */
+    public RoundRobinDistributionSchedule(int quorumSize, int ensembleSize) {
+        this.quorumSize = quorumSize;
+        this.ensembleSize = ensembleSize;
+    }
+
+    /**
+     * Maps a replica of an entry to the index of the bookie holding it.
+     *
+     * @param entryId      id of the entry
+     * @param replicaIndex which replica of the entry is meant
+     * @return {@code (entryId + replicaIndex)} modulo the ensemble size
+     */
+    @Override
+    public int getBookieIndex(long entryId, int replicaIndex) {
+        long position = entryId + replicaIndex;
+        return (int) (position % ensembleSize);
+    }
+
+    /**
+     * Inverse of {@link #getBookieIndex}: finds which replica of an entry a
+     * given bookie holds.
+     *
+     * @param entryId     id of the entry
+     * @param bookieIndex index of the bookie within the ensemble
+     * @return the replica index, or -1 if the computed index is not below
+     *         the quorum size
+     */
+    @Override
+    public int getReplicaIndex(long entryId, int bookieIndex) {
+        // Java's % takes the sign of the dividend, so bookieIndex - entryId
+        // cannot be reduced with a plain %; signSafeMod is used instead.
+        int distance = MathUtils.signSafeMod(bookieIndex - entryId, ensembleSize);
+        if (distance >= quorumSize) {
+            return -1;
+        }
+        return distance;
+    }
+
+    /**
+     * Records a recovery reply from one bookie and reports whether every
+     * quorum has now been covered by at least one reply.
+     *
+     * @param bookieIndexHeardFrom index of the bookie that replied
+     * @return {@code true} once no quorum is left uncovered
+     */
+    public synchronized boolean canProceedWithRecovery(int bookieIndexHeardFrom) {
+        if (covered == null) {
+            covered = new boolean[ensembleSize];
+            numQuorumsUncovered = ensembleSize;
+        }
+
+        // The bookie belongs to the quorums starting at its own index and at
+        // the quorumSize - 1 indices before it; stop once all are covered.
+        int offset = 0;
+        while (numQuorumsUncovered > 0 && offset < quorumSize) {
+            int quorumStartIndex = MathUtils.signSafeMod(bookieIndexHeardFrom - offset, ensembleSize);
+            if (!covered[quorumStartIndex]) {
+                covered[quorumStartIndex] = true;
+                numQuorumsUncovered--;
+            }
+            offset++;
+        }
+
+        return numQuorumsUncovered == 0;
+    }
+
+}

+ 85 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/client/SyncCounter.java

@@ -0,0 +1,85 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.bookkeeper.client;
+
+import java.util.Enumeration;
+
+/**
+ * Helper for waiting on asynchronous calls: the caller counts each outstanding
+ * call with {@link #inc()}, completions call {@link #dec()}, and
+ * {@link #block(int)} waits until the number outstanding has dropped to a
+ * given limit. It also carries a result code, an entry sequence and a ledger
+ * handle for a callback to pass back to the waiting caller.
+ *
+ * <p>Only {@code inc}, {@code dec}, {@code block} and {@code total} are
+ * synchronized; the plain getters and setters are not.
+ */
+class SyncCounter {
+    // number of calls currently outstanding
+    int i;
+    // result code stored by setrc()
+    int rc;
+    // number of inc() calls made so far; never decremented
+    int total;
+    Enumeration<LedgerEntry> seq = null;
+    LedgerHandle lh = null;
+
+    /** Registers one more outstanding call. */
+    synchronized void inc() {
+        i++;
+        total++;
+    }
+
+    /** Marks one outstanding call as finished and wakes all waiting threads. */
+    synchronized void dec() {
+        i--;
+        notifyAll();
+    }
+
+    /**
+     * Waits until at most {@code limit} calls are outstanding.
+     *
+     * @param limit largest outstanding count at which this method returns
+     * @throws InterruptedException if the thread is interrupted while waiting
+     */
+    synchronized void block(int limit) throws InterruptedException {
+        // Re-check the condition after every wakeup and nothing else: a
+        // spurious wakeup, or an inc() followed by a dec(), leaves the count
+        // unchanged and must not end the wait while it is still above limit.
+        while (i > limit) {
+            wait();
+        }
+    }
+
+    /**
+     * @return the number of {@link #inc()} calls made so far
+     */
+    synchronized int total() {
+        return total;
+    }
+
+    void setrc(int rc) {
+        this.rc = rc;
+    }
+
+    int getrc() {
+        return rc;
+    }
+
+    void setSequence(Enumeration<LedgerEntry> seq) {
+        this.seq = seq;
+    }
+
+    Enumeration<LedgerEntry> getSequence() {
+        return seq;
+    }
+
+    void setLh(LedgerHandle lh) {
+        this.lh = lh;
+    }
+
+    LedgerHandle getLh() {
+        return lh;
+    }
+}

+ 82 - 345
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/BookieClient.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.proto;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,391 +21,126 @@ package org.apache.bookkeeper.proto;
  * 
  */
 
-
 import java.io.IOException;
 import java.net.InetSocketAddress;
-import java.net.ConnectException;
-import java.nio.ByteBuffer;
-import java.nio.channels.SocketChannel;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.Semaphore;
-import java.util.concurrent.TimeUnit;
-import java.util.Enumeration;
-import java.security.NoSuchAlgorithmException;
-import java.security.InvalidKeyException;
-import java.security.MessageDigest;
-import javax.crypto.Mac; 
-import javax.crypto.spec.SecretKeySpec;
-
-//import org.apache.bookkeeper.client.AsyncCallback.FailCallback;
-import org.apache.bookkeeper.client.BookieHandle;
-import org.apache.bookkeeper.proto.ReadEntryCallback;
-import org.apache.bookkeeper.proto.WriteCallback;
+import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicLong;
+import org.apache.bookkeeper.client.BKException;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.ReadEntryCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.GenericCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.WriteCallback;
+import org.apache.bookkeeper.util.OrderedSafeExecutor;
 import org.apache.log4j.Logger;
-
-
+import org.jboss.netty.buffer.ChannelBuffer;
+import org.jboss.netty.buffer.ChannelBuffers;
+import org.jboss.netty.channel.socket.ClientSocketChannelFactory;
+import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory;
 
 /**
- * Implements the client-side part of the BookKeeper protocol. 
+ * Implements the client-side part of the BookKeeper protocol.
  * 
- */    
-public class BookieClient extends Thread {
-	Logger LOG = Logger.getLogger(BookieClient.class);
-    SocketChannel sock;
-    int myCounter = 0;
+ */
+public class BookieClient {
+    static final Logger LOG = Logger.getLogger(BookieClient.class);
 
-    public BookieClient(InetSocketAddress addr, int recvTimeout)
-    throws IOException, ConnectException { 
-        startConnection(addr, recvTimeout);
-    }
-    
-    public BookieClient(String host, int port, int recvTimeout)
-    throws IOException, ConnectException {
-        this(new InetSocketAddress(host, port), recvTimeout);
-    }
-    
-    public void startConnection(InetSocketAddress addr, int recvTimeout)
-    throws IOException, ConnectException {
-        sock = SocketChannel.open(addr);
-        setDaemon(true);
-        //sock.configureBlocking(false);
-        sock.socket().setSoTimeout(recvTimeout);
-        sock.socket().setTcpNoDelay(true);
-        start();        
-    }
-    
-    private static class Completion<T> {
-        Completion(T cb, Object ctx) {
-            this.cb = cb;
-            this.ctx = ctx;
-        }
+    // This is global state that should be shared across all BookieClients
+    AtomicLong totalBytesOutstanding = new AtomicLong();
 
-        T cb;
-        Object ctx;
-    }
+    OrderedSafeExecutor executor;
+    ClientSocketChannelFactory channelFactory;
+    ConcurrentHashMap<InetSocketAddress, PerChannelBookieClient> channels = new ConcurrentHashMap<InetSocketAddress, PerChannelBookieClient>();
 
-    private static class CompletionKey {
-        long ledgerId;
-        long entryId;
+    public BookieClient(ClientSocketChannelFactory channelFactory, OrderedSafeExecutor executor) {
+        this.channelFactory = channelFactory;
+        this.executor = executor;
+    }
 
-        CompletionKey(long ledgerId, long entryId) {
-            this.ledgerId = ledgerId;
-            this.entryId = entryId;
-        }
+    public PerChannelBookieClient lookupClient(InetSocketAddress addr) {
+        PerChannelBookieClient channel = channels.get(addr);
 
-        @Override
-        public boolean equals(Object obj) {
-            if (!(obj instanceof CompletionKey) || obj == null) {
-                return false;
+        if (channel == null) {
+            channel = new PerChannelBookieClient(executor, channelFactory, addr, totalBytesOutstanding);
+            PerChannelBookieClient prevChannel = channels.putIfAbsent(addr, channel);
+            if (prevChannel != null) {
+                channel = prevChannel;
             }
-            CompletionKey that = (CompletionKey) obj;
-            return this.ledgerId == that.ledgerId && this.entryId == that.entryId;
-        }
-
-        @Override
-        public int hashCode() {
-            return ((int) ledgerId << 16) ^ ((int) entryId);
         }
 
+        return channel;
     }
 
-    ConcurrentHashMap<CompletionKey, Completion<WriteCallback>> addCompletions = 
-        new ConcurrentHashMap<CompletionKey, Completion<WriteCallback>>();
-    
-    ConcurrentHashMap<CompletionKey, Completion<ReadEntryCallback>> readCompletions =
-        new ConcurrentHashMap<CompletionKey, Completion<ReadEntryCallback>>();
-    
-    /*
-     * Use this semaphore to control the number of completion key in both addCompletions
-     * and readCompletions. This is more of a problem for readCompletions because one
-     * readEntries opertion is expanded into individual operations to read entries.
-     */
-    Semaphore completionSemaphore = new Semaphore(3000);
-    
-   
-    /**
-     * Message disgest instance
-     * 
-     */
-    MessageDigest digest = null;
-    
-    /** 
-     * Get digest instance if there is none.
-     * 
-     */
-    public MessageDigest getDigestInstance(String alg)
-    throws NoSuchAlgorithmException {
-        if(digest == null){
-            digest = MessageDigest.getInstance(alg);
-        }
-        
-        return digest;
-    }
-    
-    /**
-     * Mac instance
-     * 
-     */
-    Mac mac = null;
-    
-    public Mac getMac(String alg, byte[] key)
-    throws NoSuchAlgorithmException, InvalidKeyException {
-        if(mac == null){
-            mac = Mac.getInstance(alg);
-            mac.init(new SecretKeySpec(key, "HmacSHA1"));
-        }
-        
-        return mac;
-    }
-    
-    /**
-     * Send addEntry operation to bookie. It throws an IOException
-     * if either the write to the socket fails or it takes too long
-     * to obtain a permit to send another request, which possibly 
-     * implies that the corresponding bookie is down.
-     * 
-     * @param ledgerId	ledger identifier
-     * @param entryId 	entry identifier
-     * @param cb		object implementing callback method
-     * @param ctx		control object
-     * @throws IOException
-     * @throws InterruptedException
-     */
-    synchronized public void addEntry(long ledgerId, byte[] masterKey, long entryId,
-            ByteBuffer entry, WriteCallback cb, Object ctx) 
-    throws IOException, InterruptedException {
-        
-        if(cb == null)
-            LOG.error("WriteCallback object is null: " + entryId);
-        addCompletions.put(new CompletionKey(ledgerId, entryId),
-                new Completion<WriteCallback>(cb, ctx));
-
-        ByteBuffer tmpEntry = ByteBuffer.allocate(entry.remaining() + 44);
+    public void addEntry(final InetSocketAddress addr, final long ledgerId, final byte[] masterKey, final long entryId,
+            final ChannelBuffer toSend, final WriteCallback cb, final Object ctx) {
 
-        tmpEntry.position(4);
-        tmpEntry.putInt(BookieProtocol.ADDENTRY);
-        tmpEntry.put(masterKey);
-        tmpEntry.putLong(ledgerId);
-        tmpEntry.putLong(entryId);
-        tmpEntry.put(entry);
-        tmpEntry.position(0);
-        
-        // 4 bytes for the message type
-        tmpEntry.putInt(tmpEntry.remaining() - 4);
-        tmpEntry.position(0);
+        final PerChannelBookieClient client = lookupClient(addr);
 
-        
-        if(!sock.isConnected() || 
-                !completionSemaphore.tryAcquire(1000, TimeUnit.MILLISECONDS)){ 
-            throw new IOException();
-        } else sock.write(tmpEntry);
+        client.connectIfNeededAndDoOp(new GenericCallback<Void>() {
+            @Override
+            public void operationComplete(int rc, Void result) {
+                if (rc != BKException.Code.OK) {
+                    cb.writeComplete(rc, ledgerId, entryId, addr, ctx);
+                    return;
+                }
+                client.addEntry(ledgerId, masterKey, entryId, toSend, cb, ctx);
+            }
+        });
     }
-    
-    /**
-     * Send readEntry operation to bookie. It throws an IOException
-     * if either the write to the socket fails or it takes too long
-     * to obtain a permit to send another request, which possibly 
-     * implies that the corresponding bookie is down.
-     * 
-     * @param ledgerId	ledger identifier
-     * @param entryId	entry identifier
-     * @param cb		object implementing callback method
-     * @param ctx		control object
-     * @throws IOException
-     */
-    synchronized public void readEntry(long ledgerId, long entryId,
-            ReadEntryCallback cb, Object ctx) 
-    throws IOException, InterruptedException {
-        //LOG.info("Entry id: " + entryId);
-    	//completionSemaphore.acquire();
-        readCompletions.put(new CompletionKey(ledgerId, entryId),
-                new Completion<ReadEntryCallback>(cb, ctx));
-        
-        ByteBuffer tmpEntry = ByteBuffer.allocate(8 + 8 + 8);
-        tmpEntry.putInt(20);
-        tmpEntry.putInt(BookieProtocol.READENTRY);
-        tmpEntry.putLong(ledgerId);
-        tmpEntry.putLong(entryId);
-        tmpEntry.position(0);
 
-        if(!sock.isConnected() || 
-                !completionSemaphore.tryAcquire(1000, TimeUnit.MILLISECONDS)){ 
-            throw new IOException();
-        } else sock.write(tmpEntry);
-    }
-    
-    private void readFully(ByteBuffer bb) throws IOException {
-        while(bb.remaining() > 0) {
-            sock.read(bb);
-        }
-    }
-    
-    Semaphore running = new Semaphore(0);
-    public void run() {
-        int len = -1;
-        ByteBuffer lenBuffer = ByteBuffer.allocate(4);
-        int type = -1, rc = -1;
-        try {
-            while(sock.isConnected()) {
-                lenBuffer.clear();
-                readFully(lenBuffer);
-                lenBuffer.flip();
-                len = lenBuffer.getInt();
-                ByteBuffer bb = ByteBuffer.allocate(len);
-                readFully(bb);
-                bb.flip();
-                type = bb.getInt();
-                rc = bb.getInt();
- 
-                switch(type) {
-                case BookieProtocol.ADDENTRY:
-                {                    
-                    long ledgerId = bb.getLong();
-                    long entryId = bb.getLong();
+    public void readEntry(final InetSocketAddress addr, final long ledgerId, final long entryId,
+            final ReadEntryCallback cb, final Object ctx) {
 
-                    Completion<WriteCallback> ac;
-                    ac = addCompletions.remove(new CompletionKey(ledgerId, entryId));
-                    completionSemaphore.release();
-                    if (ac != null) {
-                        ac.cb.writeComplete(rc, ledgerId, entryId, ac.ctx);
-                    } else {
-                        LOG.error("Callback object null: " + ledgerId + " : " + entryId);
-                    }
+        final PerChannelBookieClient client = lookupClient(addr);
 
-                    break;
-                }
-                case BookieProtocol.READENTRY:
-                {
-                    long ledgerId = bb.getLong();
-                    long entryId = bb.getLong();
-                    
-                    bb.position(24);
-                    byte[] data = new byte[bb.capacity() - 24];
-                    bb.get(data);
-                    ByteBuffer entryData = ByteBuffer.wrap(data);         
-                    
-                    CompletionKey key = new CompletionKey(ledgerId, entryId);
-                    Completion<ReadEntryCallback> c;
-                    
-                    if(readCompletions.containsKey(key)){
-                            c = readCompletions.remove(key);
-                    }
-                    else{    
-                            /*
-                             * This is a special case. When recovering a ledger, a client submits
-                             * a read request with id -1, and receives a response with a different
-                             * entry id.
-                             */
-                            c = readCompletions.remove(new CompletionKey(ledgerId, -1));
-                    }
-                    completionSemaphore.release();
-                    
-                    if (c != null) {
-                        c.cb.readEntryComplete(rc, 
-                                ledgerId, 
-                                entryId, 
-                                entryData, 
-                                c.ctx);
-                    }
-                    break;
-                }
-                default:
-                    System.err.println("Got error " + rc + " for type " + type);
+        client.connectIfNeededAndDoOp(new GenericCallback<Void>() {
+            @Override
+            public void operationComplete(int rc, Void result) {
+
+                if (rc != BKException.Code.OK) {
+                    cb.readEntryComplete(rc, ledgerId, entryId, null, ctx);
+                    return;
                 }
+                client.readEntry(ledgerId, entryId, cb, ctx);
             }
-            
-        } catch(Exception e) {
-            LOG.error("Len = " + len + ", Type = " + type + ", rc = " + rc);
-        }
-        running.release();
-        
-    }
-    
-    /**
-     * Errors out pending entries. We call this method from one thread to avoid
-     * concurrent executions to QuorumOpMonitor (implements callbacks). It seems
-     * simpler to call it from BookieHandle instead of calling directly from here.
-     */
-    
-    public void errorOut(){
-        LOG.info("Erroring out pending entries");
-    
-        for (Enumeration<CompletionKey> e = addCompletions.keys() ; e.hasMoreElements() ;) {
-            CompletionKey key = e.nextElement();
-            Completion<WriteCallback> ac = addCompletions.remove(key);
-            if(ac != null){
-                completionSemaphore.release();
-                ac.cb.writeComplete(-1, key.ledgerId, key.entryId, ac.ctx);
-            }
-        }
-        
-        LOG.info("Finished erroring out pending add entries");
-         
-        for (Enumeration<CompletionKey> e = readCompletions.keys() ; e.hasMoreElements() ;) {
-            CompletionKey key = e.nextElement();
-            Completion<ReadEntryCallback> ac = readCompletions.remove(key);
-                
-            if(ac != null){
-                completionSemaphore.release();
-                ac.cb.readEntryComplete(-1, key.ledgerId, key.entryId, null, ac.ctx);
-            }
-        }
-        
-        LOG.info("Finished erroring out pending read entries");
+        });
     }
 
-    /**
-     * Halts client.
-     */
-    
-    public void halt() {
-        try{
-            sock.close();
-        } catch(IOException e) {
-            LOG.warn("Exception while closing socket");
-        }
-        
-        try{
-            running.acquire();
-        } catch(InterruptedException e){
-            LOG.error("Interrupted while waiting for running semaphore to acquire lock");
+    public void close(){
+        for (PerChannelBookieClient channel: channels.values()){
+            channel.close();
         }
     }
-    
-    /**
-     * Returns the status of the socket of this bookie client.
-     * 
-     * @return boolean
-     */
-    public boolean isConnected(){
-        return sock.isConnected();
-    }
 
     private static class Counter {
         int i;
         int total;
+
         synchronized void inc() {
             i++;
             total++;
         }
+
         synchronized void dec() {
             i--;
             notifyAll();
         }
+
         synchronized void wait(int limit) throws InterruptedException {
-            while(i > limit) {
+            while (i > limit) {
                 wait();
             }
         }
+
         synchronized int total() {
             return total;
         }
     }
+
     /**
      * @param args
-     * @throws IOException 
-     * @throws NumberFormatException 
-     * @throws InterruptedException 
+     * @throws IOException
+     * @throws NumberFormatException
+     * @throws InterruptedException
      */
     public static void main(String[] args) throws NumberFormatException, IOException, InterruptedException {
         if (args.length != 3) {
@@ -413,8 +149,8 @@ public class BookieClient extends Thread {
         }
         WriteCallback cb = new WriteCallback() {
 
-            public void writeComplete(int rc, long ledger, long entry, Object ctx) {
-                Counter counter = (Counter)ctx;
+            public void writeComplete(int rc, long ledger, long entry, InetSocketAddress addr, Object ctx) {
+                Counter counter = (Counter) ctx;
                 counter.dec();
                 if (rc != 0) {
                     System.out.println("rc = " + rc + " for " + entry + "@" + ledger);
@@ -424,18 +160,19 @@ public class BookieClient extends Thread {
         Counter counter = new Counter();
         byte hello[] = "hello".getBytes();
         long ledger = Long.parseLong(args[2]);
-        BookieClient bc = new BookieClient(args[0], Integer.parseInt(args[1]), 5000);
-        for(int i = 0; i < 100000; i++) {
-            ByteBuffer entry = ByteBuffer.allocate(100);
-            entry.putLong(ledger);
-            entry.putLong(i);
-            entry.putInt(0);
-            entry.put(hello);
-            entry.flip();
+        ClientSocketChannelFactory channelFactory = new NioClientSocketChannelFactory(Executors.newCachedThreadPool(), Executors
+                .newCachedThreadPool());
+        OrderedSafeExecutor executor = new OrderedSafeExecutor(1);
+        BookieClient bc = new BookieClient(channelFactory, executor);
+        InetSocketAddress addr = new InetSocketAddress(args[0], Integer.parseInt(args[1]));
+
+        for (int i = 0; i < 100000; i++) {
             counter.inc();
-            bc.addEntry(ledger, new byte[0], i, entry, cb, counter);
+            bc.addEntry(addr, ledger, new byte[0], i, ChannelBuffers.wrappedBuffer(hello), cb, counter);
         }
         counter.wait(0);
         System.out.println("Total = " + counter.total());
+        channelFactory.releaseExternalResources();
+        executor.shutdown();
     }
 }

+ 19 - 20
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/BookieProtocol.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.proto;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,33 +21,31 @@ package org.apache.bookkeeper.proto;
  * 
  */
 
-
 /**
- * The packets of the Bookie protocol all have a 4-byte integer
- * indicating the type of request or response at the very beginning
- * of the packet followed by a payload.
- *
+ * The packets of the Bookie protocol all have a 4-byte integer indicating the
+ * type of request or response at the very beginning of the packet followed by a
+ * payload.
+ * 
  */
 public interface BookieProtocol {
     /**
-     * The Add entry request payload will be a ledger entry exactly
-     * as it should be logged. The response payload will be a 4-byte
-     * integer that has the error code followed by the 8-byte
-     * ledger number and 8-byte entry number of the entry written.
+     * The Add entry request payload will be a ledger entry exactly as it should
+     * be logged. The response payload will be a 4-byte integer that has the
+     * error code followed by the 8-byte ledger number and 8-byte entry number
+     * of the entry written.
      */
     public static final int ADDENTRY = 1;
     /**
-     * The Read entry request payload will be the ledger number and
-     * entry number to read. (The ledger number is an 8-byte integer
-     * and the entry number is a 8-byte integer.) The
-     * response payload will be a 4-byte integer representing an 
-     * error code and a ledger entry if the error code is EOK, otherwise
-     * it will be the 8-byte ledger number and the 4-byte entry number
-     * requested. (Note that the first sixteen bytes of the entry happen
-     * to be the ledger number and entry number as well.)
+     * The Read entry request payload will be the ledger number and entry number
+     * to read. (The ledger number is an 8-byte integer and the entry number is
+     * an 8-byte integer.) The response payload will be a 4-byte integer
+     * representing an error code and a ledger entry if the error code is EOK,
+     * otherwise it will be the 8-byte ledger number and the 8-byte entry number
+     * requested. (Note that the first sixteen bytes of the entry happen to be
+     * the ledger number and entry number as well.)
      */
     public static final int READENTRY = 2;
-    
+
     /**
      * The error code that indicates success
      */
@@ -67,10 +66,10 @@ public interface BookieProtocol {
      * General error occurred at the server
      */
     public static final int EIO = 101;
-    
+
     /**
      * Unauthorized access to ledger
      */
     public static final int EUA = 102;
-    
+
 }

+ 52 - 44
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/BookieServer.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.proto;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,9 +21,9 @@ package org.apache.bookkeeper.proto;
  * 
  */
 
-
 import java.io.File;
 import java.io.IOException;
+import java.net.InetSocketAddress;
 import java.nio.ByteBuffer;
 
 import org.apache.bookkeeper.bookie.Bookie;
@@ -30,107 +31,114 @@ import org.apache.bookkeeper.bookie.BookieException;
 import org.apache.bookkeeper.proto.NIOServerFactory.Cnxn;
 import org.apache.log4j.Logger;
 
-
-
 /**
  * Implements the server-side part of the BookKeeper protocol.
- *
+ * 
  */
-public class BookieServer implements NIOServerFactory.PacketProcessor, WriteCallback {
+public class BookieServer implements NIOServerFactory.PacketProcessor, BookkeeperInternalCallbacks.WriteCallback {
     int port;
     NIOServerFactory nioServerFactory;
     volatile boolean down = false;
     Bookie bookie;
     static Logger LOG = Logger.getLogger(BookieServer.class);
-    
-    public BookieServer(int port, File journalDirectory, File ledgerDirectories[]) {
+
+    public BookieServer(int port, File journalDirectory, File ledgerDirectories[]) throws IOException {
         this.port = port;
         this.bookie = new Bookie(journalDirectory, ledgerDirectories);
     }
+
     public void start() throws IOException {
         nioServerFactory = new NIOServerFactory(port, this);
     }
+
     public void shutdown() throws InterruptedException {
         down = true;
         nioServerFactory.shutdown();
         bookie.shutdown();
     }
-    public boolean isDown(){
+
+    public boolean isDown() {
         return down;
     }
+
     public void join() throws InterruptedException {
         nioServerFactory.join();
     }
+
     /**
      * @param args
-     * @throws IOException 
-     * @throws InterruptedException 
+     * @throws IOException
+     * @throws InterruptedException
      */
     public static void main(String[] args) throws IOException, InterruptedException {
-    	if (args.length < 3) {
+        if (args.length < 3) {
             System.err.println("USAGE: BookieServer port journalDirectory ledgerDirectory [ledgerDirectory]*");
             return;
         }
         int port = Integer.parseInt(args[0]);
         File journalDirectory = new File(args[1]);
-        File ledgerDirectory[] = new File[args.length-2];
+        File ledgerDirectory[] = new File[args.length - 2];
         StringBuilder sb = new StringBuilder();
-        for(int i = 0; i < ledgerDirectory.length; i++) {
-            ledgerDirectory[i] = new File(args[i+2]);
+        for (int i = 0; i < ledgerDirectory.length; i++) {
+            ledgerDirectory[i] = new File(args[i + 2]);
             if (i != 0) {
                 sb.append(',');
             }
             sb.append(ledgerDirectory[i]);
         }
-        String hello = String.format("Hello, I'm your bookie, listening on port %1$s. Journals are in %2$s. Ledgers are stored in %3$s.", port, journalDirectory, sb);
+        String hello = String.format(
+                "Hello, I'm your bookie, listening on port %1$s. Journals are in %2$s. Ledgers are stored in %3$s.",
+                port, journalDirectory, sb);
         LOG.info(hello);
         BookieServer bs = new BookieServer(port, journalDirectory, ledgerDirectory);
         bs.start();
         bs.join();
     }
 
-   
     public void processPacket(ByteBuffer packet, Cnxn src) {
         int type = packet.getInt();
-        switch(type) {
+        switch (type) {
         case BookieProtocol.ADDENTRY:
             try {
                 byte[] masterKey = new byte[20];
                 packet.get(masterKey, 0, 20);
-                //LOG.debug("Master key: " + new String(masterKey));
+                // LOG.debug("Master key: " + new String(masterKey));
                 bookie.addEntry(packet.slice(), this, src, masterKey);
-            } catch(IOException e) {
-                if (LOG.isTraceEnabled()) {
-                    ByteBuffer bb = packet.duplicate();
-    
-                    long ledgerId = bb.getLong();
-                    long entryId = bb.getLong();
-                    LOG.trace("Error reading " + entryId + "@" + ledgerId, e);
-                }
-                ByteBuffer eio = ByteBuffer.allocate(8);
+            } catch (IOException e) {
+                ByteBuffer bb = packet.duplicate();
+
+                long ledgerId = bb.getLong();
+                long entryId = bb.getLong();
+                LOG.error("Error writing " + entryId + "@" + ledgerId, e);
+                ByteBuffer eio = ByteBuffer.allocate(8 + 16);
                 eio.putInt(type);
                 eio.putInt(BookieProtocol.EIO);
+                eio.putLong(ledgerId);
+                eio.putLong(entryId);
                 eio.flip();
-                src.sendResponse(new ByteBuffer[] {eio});
-            } catch(BookieException e){
+                src.sendResponse(new ByteBuffer[] { eio });
+            } catch (BookieException e) {
                 ByteBuffer bb = packet.duplicate();
                 long ledgerId = bb.getLong();
-                
+                long entryId = bb.getLong();
+
                 LOG.error("Unauthorized access to ledger " + ledgerId);
-                
-                ByteBuffer eio = ByteBuffer.allocate(8);
+
+                ByteBuffer eio = ByteBuffer.allocate(8 + 16);
                 eio.putInt(type);
                 eio.putInt(BookieProtocol.EUA);
+                eio.putLong(ledgerId);
+                eio.putLong(entryId);
                 eio.flip();
-                src.sendResponse(new ByteBuffer[] {eio});
+                src.sendResponse(new ByteBuffer[] { eio });
             }
             break;
         case BookieProtocol.READENTRY:
             ByteBuffer[] rsp = new ByteBuffer[2];
-            ByteBuffer rc = ByteBuffer.allocate(8+8+8);
+            ByteBuffer rc = ByteBuffer.allocate(8 + 8 + 8);
             rsp[0] = rc;
             rc.putInt(type);
-            
+
             long ledgerId = packet.getLong();
             long entryId = packet.getLong();
             LOG.debug("Received new read request: " + ledgerId + ", " + entryId);
@@ -138,17 +146,17 @@ public class BookieServer implements NIOServerFactory.PacketProcessor, WriteCall
                 rsp[1] = bookie.readEntry(ledgerId, entryId);
                 LOG.debug("##### Read entry ##### " + rsp[1].remaining());
                 rc.putInt(BookieProtocol.EOK);
-            } catch(Bookie.NoLedgerException e) {
+            } catch (Bookie.NoLedgerException e) {
                 if (LOG.isTraceEnabled()) {
                     LOG.error("Error reading " + entryId + "@" + ledgerId, e);
                 }
                 rc.putInt(BookieProtocol.ENOLEDGER);
-            } catch(Bookie.NoEntryException e) {
+            } catch (Bookie.NoEntryException e) {
                 if (LOG.isTraceEnabled()) {
                     LOG.error("Error reading " + entryId + "@" + ledgerId, e);
                 }
                 rc.putInt(BookieProtocol.ENOENTRY);
-            } catch(IOException e) {
+            } catch (IOException e) {
                 if (LOG.isTraceEnabled()) {
                     LOG.error("Error reading " + entryId + "@" + ledgerId, e);
                 }
@@ -178,12 +186,12 @@ public class BookieServer implements NIOServerFactory.PacketProcessor, WriteCall
             badType.putInt(type);
             badType.putInt(BookieProtocol.EBADREQ);
             badType.flip();
-            src.sendResponse(new ByteBuffer[] {packet});
+            src.sendResponse(new ByteBuffer[] { packet });
         }
     }
-    
-    public void writeComplete(int rc, long ledgerId, long entryId, Object ctx) {
-        Cnxn src = (Cnxn)ctx;
+
+    public void writeComplete(int rc, long ledgerId, long entryId, InetSocketAddress addr, Object ctx) {
+        Cnxn src = (Cnxn) ctx;
         ByteBuffer bb = ByteBuffer.allocate(24);
         bb.putInt(BookieProtocol.ADDENTRY);
         bb.putInt(rc);
@@ -193,7 +201,7 @@ public class BookieServer implements NIOServerFactory.PacketProcessor, WriteCall
         if (LOG.isTraceEnabled()) {
             LOG.trace("Add entry rc = " + rc + " for " + entryId + "@" + ledgerId);
         }
-        src.sendResponse(new ByteBuffer[] {bb});
+        src.sendResponse(new ByteBuffer[] { bb });
     }
-
+    
 }

+ 57 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/BookkeeperInternalCallbacks.java

@@ -0,0 +1,57 @@
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+
+package org.apache.bookkeeper.proto;
+
+import java.net.InetSocketAddress;
+import org.jboss.netty.buffer.ChannelBuffer;
+
+/**
+ * Declaration of the callback interfaces used in the bookkeeper client library but
+ * not exposed to the client application.
+ */
+
+public class BookkeeperInternalCallbacks {
+    /**
+     * Callback for calls from BookieClient objects. Such calls are for replies
+     * of write operations (operations to add an entry to a ledger).
+     * BookieClient passes the address the add was sent to as addr and hands ctx back unchanged.
+     */
+
+    public interface WriteCallback {
+        void writeComplete(int rc, long ledgerId, long entryId, InetSocketAddress addr, Object ctx);
+    }
+
+    public interface GenericCallback<T> { // general-purpose completion callback: a result code plus a typed result
+        void operationComplete(int rc, T result);
+    }
+    
+    /**
+     * Declaration of a callback interface for calls from BookieClient objects.
+     * Such calls are for replies of read operations (operations to read an entry
+     * from a ledger).
+     * BookieClient passes a null buffer when connectIfNeededAndDoOp reports a non-OK rc.
+     */
+
+    public interface ReadEntryCallback {
+        void readEntryComplete(int rc, long ledgerId, long entryId, ChannelBuffer buffer, Object ctx);
+    }
+}

+ 28 - 38
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/NIOServerFactory.java

@@ -45,8 +45,9 @@ public class NIOServerFactory extends Thread {
     public interface PacketProcessor {
         public void processPacket(ByteBuffer packet, Cnxn src);
     }
-    ServerStats stats = new ServerStats();
     
+    ServerStats stats = new ServerStats();
+
     Logger LOG = Logger.getLogger(NIOServerFactory.class);
 
     ServerSocketChannel ss;
@@ -89,6 +90,7 @@ public class NIOServerFactory extends Thread {
         }
     }
 
+    @Override
     public void run() {
         while (!ss.socket().isClosed()) {
             try {
@@ -97,16 +99,13 @@ public class NIOServerFactory extends Thread {
                 synchronized (this) {
                     selected = selector.selectedKeys();
                 }
-                ArrayList<SelectionKey> selectedList = new ArrayList<SelectionKey>(
-                        selected);
+                ArrayList<SelectionKey> selectedList = new ArrayList<SelectionKey>(selected);
                 Collections.shuffle(selectedList);
                 for (SelectionKey k : selectedList) {
                     if ((k.readyOps() & SelectionKey.OP_ACCEPT) != 0) {
-                        SocketChannel sc = ((ServerSocketChannel) k.channel())
-                                .accept();
+                        SocketChannel sc = ((ServerSocketChannel) k.channel()).accept();
                         sc.configureBlocking(false);
-                        SelectionKey sk = sc.register(selector,
-                                SelectionKey.OP_READ);
+                        SelectionKey sk = sc.register(selector, SelectionKey.OP_READ);
                         Cnxn cnxn = new Cnxn(sc, sk);
                         sk.attach(cnxn);
                         addCnxn(cnxn);
@@ -167,7 +166,7 @@ public class NIOServerFactory extends Thread {
     public class Cnxn {
 
         private SocketChannel sock;
-        
+
         private SelectionKey sk;
 
         boolean initialized;
@@ -183,7 +182,7 @@ public class NIOServerFactory extends Thread {
         int packetsSent;
 
         int packetsReceived;
-        
+
         void doIO(SelectionKey k) throws InterruptedException {
             try {
                 if (sock == null) {
@@ -233,8 +232,7 @@ public class NIOServerFactory extends Thread {
                                  * be copied, so we've got to slice the buffer
                                  * if it's too big.
                                  */
-                                b = (ByteBuffer) b.slice().limit(
-                                        directBuffer.remaining());
+                                b = (ByteBuffer) b.slice().limit(directBuffer.remaining());
                             }
                             /*
                              * put() is going to modify the positions of both
@@ -286,15 +284,12 @@ public class NIOServerFactory extends Thread {
                     }
                     synchronized (this) {
                         if (outgoingBuffers.size() == 0) {
-                            if (!initialized
-                                    && (sk.interestOps() & SelectionKey.OP_READ) == 0) {
+                            if (!initialized && (sk.interestOps() & SelectionKey.OP_READ) == 0) {
                                 throw new IOException("Responded to info probe");
                             }
-                            sk.interestOps(sk.interestOps()
-                                    & (~SelectionKey.OP_WRITE));
+                            sk.interestOps(sk.interestOps() & (~SelectionKey.OP_WRITE));
                         } else {
-                            sk.interestOps(sk.interestOps()
-                                    | SelectionKey.OP_WRITE);
+                            sk.interestOps(sk.interestOps() | SelectionKey.OP_WRITE);
                         }
                     }
                 }
@@ -349,9 +344,8 @@ public class NIOServerFactory extends Thread {
         }
 
         String peerName;
-        
-        public Cnxn(SocketChannel sock, SelectionKey sk)
-                throws IOException {
+
+        public Cnxn(SocketChannel sock, SelectionKey sk) throws IOException {
             this.sock = sock;
             this.sk = sk;
             sock.socket().setTcpNoDelay(true);
@@ -360,14 +354,14 @@ public class NIOServerFactory extends Thread {
             if (LOG.isTraceEnabled()) {
                 peerName = sock.socket().toString();
             }
-            
+
             lenBuffer.clear();
             incomingBuffer = lenBuffer;
         }
 
+        @Override
         public String toString() {
-            return "NIOServerCnxn object with sock = " + sock + " and sk = "
-                    + sk;
+            return "NIOServerCnxn object with sock = " + sock + " and sk = " + sk;
         }
 
         boolean closed;
@@ -437,11 +431,11 @@ public class NIOServerFactory extends Thread {
                 throw e;
             }
         }
-        
+
         private void sendBuffers(ByteBuffer bb[]) {
             ByteBuffer len = ByteBuffer.allocate(4);
             int total = 0;
-            for(int i = 0; i < bb.length; i++) {
+            for (int i = 0; i < bb.length; i++) {
                 if (bb[i] != null) {
                     total += bb[i].remaining();
                 }
@@ -452,14 +446,14 @@ public class NIOServerFactory extends Thread {
             len.putInt(total);
             len.flip();
             outgoingBuffers.add(len);
-            for(int i = 0; i < bb.length; i++) {
+            for (int i = 0; i < bb.length; i++) {
                 if (bb[i] != null) {
                     outgoingBuffers.add(bb[i]);
                 }
             }
             makeWritable(sk);
         }
-        
+
         synchronized public void sendResponse(ByteBuffer bb[]) {
             if (closed) {
                 return;
@@ -485,8 +479,8 @@ public class NIOServerFactory extends Thread {
             long packetsSent;
 
             /**
-             * The number of requests that have been submitted but not yet responded
-             * to.
+             * The number of requests that have been submitted but not yet
+             * responded to.
              */
             public long getOutstandingRequests() {
                 return outstandingRequests;
@@ -500,19 +494,15 @@ public class NIOServerFactory extends Thread {
                 return packetsSent;
             }
 
+            @Override
             public String toString() {
                 StringBuilder sb = new StringBuilder();
                 Channel channel = sk.channel();
                 if (channel instanceof SocketChannel) {
-                    sb.append(" ").append(
-                            ((SocketChannel) channel).socket()
-                                    .getRemoteSocketAddress()).append("[")
-                            .append(Integer.toHexString(sk.interestOps()))
-                            .append("](queued=").append(
-                                    getOutstandingRequests())
-                            .append(",recved=").append(getPacketsReceived())
-                            .append(",sent=").append(getPacketsSent()).append(
-                                    ")\n");
+                    sb.append(" ").append(((SocketChannel) channel).socket().getRemoteSocketAddress()).append("[")
+                            .append(Integer.toHexString(sk.interestOps())).append("](queued=").append(
+                                    getOutstandingRequests()).append(",recved=").append(getPacketsReceived()).append(
+                                    ",sent=").append(getPacketsSent()).append(")\n");
                 }
                 return sb.toString();
             }

+ 570 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java

@@ -0,0 +1,570 @@
+package org.apache.bookkeeper.proto;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.ArrayDeque;
+import java.util.Queue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.bookkeeper.client.BKException;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.GenericCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.WriteCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.ReadEntryCallback;
+import org.apache.bookkeeper.util.OrderedSafeExecutor;
+import org.apache.bookkeeper.util.SafeRunnable;
+import org.apache.log4j.Logger;
+import org.jboss.netty.bootstrap.ClientBootstrap;
+import org.jboss.netty.buffer.ChannelBuffer;
+import org.jboss.netty.buffer.ChannelBuffers;
+import org.jboss.netty.channel.Channel;
+import org.jboss.netty.channel.ChannelFactory;
+import org.jboss.netty.channel.ChannelFuture;
+import org.jboss.netty.channel.ChannelFutureListener;
+import org.jboss.netty.channel.ChannelHandlerContext;
+import org.jboss.netty.channel.ChannelPipeline;
+import org.jboss.netty.channel.ChannelPipelineCoverage;
+import org.jboss.netty.channel.ChannelPipelineFactory;
+import org.jboss.netty.channel.ChannelStateEvent;
+import org.jboss.netty.channel.Channels;
+import org.jboss.netty.channel.ExceptionEvent;
+import org.jboss.netty.channel.MessageEvent;
+import org.jboss.netty.channel.SimpleChannelHandler;
+import org.jboss.netty.channel.socket.ClientSocketChannelFactory;
+import org.jboss.netty.handler.codec.frame.CorruptedFrameException;
+import org.jboss.netty.handler.codec.frame.LengthFieldBasedFrameDecoder;
+import org.jboss.netty.handler.codec.frame.TooLongFrameException;
+
+/**
+ * This class manages all details of connection to a particular bookie. It also
+ * has reconnect logic if a connection to a bookie fails.
+ * 
+ */
+
+@ChannelPipelineCoverage("one")
+public class PerChannelBookieClient extends SimpleChannelHandler implements ChannelPipelineFactory {
+
+    static final Logger LOG = Logger.getLogger(PerChannelBookieClient.class);
+
+    // One fifth of the JVM's maximum memory. Only referenced by the
+    // commented-out throttling sketch in addEntry.
+    static final long maxMemory = Runtime.getRuntime().maxMemory() / 5;
+    // Upper bound handed to the LengthFieldBasedFrameDecoder in getPipeline().
+    public static int MAX_FRAME_LENGTH = 2 * 1024 * 1024; // 2M
+
+    // Address of the bookie this client talks to.
+    InetSocketAddress addr;
+    // Set to true by the connect listener on success, and to false on
+    // connect failure or in channelDisconnected.
+    boolean connected = false;
+    // Passed in by the creator; not read or updated by any active code in
+    // this class (the only uses are commented out).
+    AtomicLong totalBytesOutstanding;
+    // Factory used for every (re)connection attempt.
+    ClientSocketChannelFactory channelFactory;
+    // Executor on which response and failure callbacks are run, ordered by
+    // ledger id.
+    OrderedSafeExecutor executor;
+
+    // Outstanding requests keyed by (ledgerId, entryId). Whoever removes a key
+    // is responsible for invoking the corresponding callback.
+    ConcurrentHashMap<CompletionKey, AddCompletion> addCompletions = new ConcurrentHashMap<CompletionKey, AddCompletion>();
+    ConcurrentHashMap<CompletionKey, ReadCompletion> readCompletions = new ConcurrentHashMap<CompletionKey, ReadCompletion>();
+
+    /**
+     * The following member variables do not need to be concurrent, or volatile
+     * because they are always updated under a lock
+     *
+     * NOTE(review): connectIfNeededAndDoOp reads channel (and connected)
+     * without holding the lock, the constructor's connect() call sets
+     * connectionAttemptInProgress without it, and channelDisconnected writes
+     * connected without it -- confirm that the visibility guarantees are
+     * sufficient.
+     */
+    // Operations waiting for the in-flight connection attempt to finish.
+    Queue<GenericCallback<Void>> pendingOps = new ArrayDeque<GenericCallback<Void>>();
+    // True between the start of connect() and the completion of its future.
+    boolean connectionAttemptInProgress;
+    // Channel of the last successful connection attempt; null before the
+    // first success and after a failed attempt.
+    Channel channel = null;
+
+    /**
+     * Creates the client for one bookie and immediately starts the first
+     * connection attempt.
+     *
+     * @param executor
+     *            executor on which callbacks are dispatched
+     * @param channelFactory
+     *            netty factory used to open channels to the bookie
+     * @param addr
+     *            address of the bookie
+     * @param totalBytesOutstanding
+     *            shared counter of outstanding bytes, stored for later use
+     */
+    public PerChannelBookieClient(OrderedSafeExecutor executor, ClientSocketChannelFactory channelFactory,
+            InetSocketAddress addr, AtomicLong totalBytesOutstanding) {
+        this.executor = executor;
+        this.channelFactory = channelFactory;
+        this.totalBytesOutstanding = totalBytesOutstanding;
+        this.addr = addr;
+
+        // All state is in place; kick off the asynchronous connect.
+        connect(this.channelFactory);
+    }
+
+    /**
+     * Starts an asynchronous connection attempt to the bookie at {@code addr}.
+     *
+     * When the attempt finishes, the listener records the outcome (channel,
+     * connected flag) under this object's lock, swaps out the queue of pending
+     * operations, and then -- outside the lock -- completes every queued
+     * operation with OK or BookieHandleNotAvailableException.
+     *
+     * @param channelFactory
+     *            factory used to create the new channel
+     */
+    void connect(ChannelFactory channelFactory) {
+
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("Connecting to bookie: " + addr);
+        }
+
+        // Set up the ClientBootStrap so we can create a new Channel connection
+        // to the bookie.
+        ClientBootstrap bootstrap = new ClientBootstrap(channelFactory);
+        bootstrap.setPipelineFactory(this);
+        bootstrap.setOption("tcpNoDelay", true);
+        bootstrap.setOption("keepAlive", true);
+
+        // Start the connection attempt to the input server host.
+        connectionAttemptInProgress = true;
+
+        ChannelFuture future = bootstrap.connect(addr);
+
+        future.addListener(new ChannelFutureListener() {
+            @Override
+            public void operationComplete(ChannelFuture future) throws Exception {
+                int rc;
+                Queue<GenericCallback<Void>> oldPendingOps;
+
+                synchronized (PerChannelBookieClient.this) {
+
+                    if (future.isSuccess()) {
+                        LOG.info("Successfully connected to bookie: " + addr);
+                        rc = BKException.Code.OK;
+                        channel = future.getChannel();
+                        connected = true;
+                    } else {
+                        LOG.error("Could not connect to bookie: " + addr);
+                        rc = BKException.Code.BookieHandleNotAvailableException;
+                        channel = null;
+                        connected = false;
+                    }
+
+                    // The branches above already wrote the outer field
+                    // directly, so no further publication of the channel is
+                    // needed here.
+                    connectionAttemptInProgress = false;
+
+                    // trick to not do operations under the lock, take the list
+                    // of pending ops and assign it to a new variable, while
+                    // emptying the pending ops by just assigning it to a new
+                    // list
+                    oldPendingOps = pendingOps;
+                    pendingOps = new ArrayDeque<GenericCallback<Void>>();
+                }
+
+                for (GenericCallback<Void> pendingOp : oldPendingOps) {
+                    pendingOp.operationComplete(rc, null);
+                }
+
+            }
+        });
+    }
+
+    /**
+     * Runs {@code op} right away if a connected channel is available;
+     * otherwise queues it until the current (or a newly started) connection
+     * attempt completes.
+     *
+     * @param op
+     *            operation to complete; it is called with OK when run
+     *            immediately, or with the connection attempt's result code
+     *            when it had to be queued
+     */
+    void connectIfNeededAndDoOp(GenericCallback<Void> op) {
+        // Common case first, without taking the lock.
+        boolean ready = (channel != null && connected);
+
+        if (!ready) {
+            synchronized (this) {
+                // Re-check now that we hold the lock.
+                ready = (channel != null && connected);
+
+                if (!ready) {
+                    // Either there never was a channel or it got
+                    // disconnected. Park the op; it is completed when the
+                    // connection attempt succeeds or fails.
+                    pendingOps.add(op);
+
+                    if (!connectionAttemptInProgress) {
+                        connect(channelFactory);
+                    }
+                }
+            }
+        }
+
+        // The callback is deliberately invoked outside the lock.
+        if (ready) {
+            op.operationComplete(BKException.Code.OK, null);
+        }
+    }
+
+    /**
+     * Sends an add-entry request to the bookie. This method should be called
+     * only after the connection has been checked with
+     * {@link #connectIfNeededAndDoOp(GenericCallback)}.
+     *
+     * The request is laid out as: 4-byte length of everything that follows,
+     * 4-byte request type, the master key, and then the contents of
+     * {@code toSend}.
+     *
+     * @param ledgerId
+     *            ledger the entry belongs to
+     * @param masterKey
+     *            master key written into the request header
+     * @param entryId
+     *            id of the entry being added
+     * @param toSend
+     *            buffer appended after the header as the request payload
+     * @param cb
+     *            callback invoked when the add completes or fails
+     * @param ctx
+     *            opaque context handed back to the callback
+     */
+    void addEntry(final long ledgerId, byte[] masterKey, final long entryId, ChannelBuffer toSend, WriteCallback cb,
+            Object ctx) {
+
+        final int entrySize = toSend.readableBytes();
+        // TODO: throttling on totalBytesOutstanding versus maxMemory is not
+        // implemented; it may belong in the layer above.
+
+        // Register the completion before writing so a response or a write
+        // failure can always find it.
+        final CompletionKey completionKey = new CompletionKey(ledgerId, entryId);
+        addCompletions.put(completionKey, new AddCompletion(cb, entrySize, ctx));
+
+        // packet length (4) + request type (4) + master key
+        final int headerSize = 4 + 4 + masterKey.length;
+
+        ChannelBuffer header = channel.getConfig().getBufferFactory().getBuffer(headerSize);
+        // The length field does not count its own 4 bytes.
+        header.writeInt(headerSize - 4 + entrySize);
+        header.writeInt(BookieProtocol.ADDENTRY);
+        header.writeBytes(masterKey);
+
+        ChannelBuffer request = ChannelBuffers.wrappedBuffer(header, toSend);
+
+        channel.write(request).addListener(new ChannelFutureListener() {
+            @Override
+            public void operationComplete(ChannelFuture writeFuture) throws Exception {
+                if (!writeFuture.isSuccess()) {
+                    errorOutAddKey(completionKey);
+                    return;
+                }
+                if (LOG.isDebugEnabled()) {
+                    LOG.debug("Successfully wrote request for adding entry: " + entryId + " ledger-id: " + ledgerId
+                            + " bookie: " + channel.getRemoteAddress() + " entry length: " + entrySize);
+                }
+            }
+        });
+
+    }
+
+    /**
+     * Sends a read-entry request to the bookie.
+     *
+     * The request is laid out as: 4-byte length of everything that follows,
+     * 4-byte request type, 8-byte ledger id, 8-byte entry id.
+     *
+     * @param ledgerId
+     *            ledger to read from
+     * @param entryId
+     *            entry to read
+     * @param cb
+     *            callback invoked when the read completes or fails
+     * @param ctx
+     *            opaque context handed back to the callback
+     */
+    public void readEntry(final long ledgerId, final long entryId, ReadEntryCallback cb, Object ctx) {
+
+        // Register the completion before writing so a response or a write
+        // failure can always find it.
+        final CompletionKey key = new CompletionKey(ledgerId, entryId);
+        readCompletions.put(key, new ReadCompletion(cb, ctx));
+
+        // packet length (4) + request type (4) + ledgerId (8) + entryId (8)
+        final int requestSize = 4 + 4 + 8 + 8;
+
+        ChannelBuffer request = channel.getConfig().getBufferFactory().getBuffer(requestSize);
+        // The length field does not count its own 4 bytes.
+        request.writeInt(requestSize - 4);
+        request.writeInt(BookieProtocol.READENTRY);
+        request.writeLong(ledgerId);
+        request.writeLong(entryId);
+
+        channel.write(request).addListener(new ChannelFutureListener() {
+            @Override
+            public void operationComplete(ChannelFuture writeFuture) throws Exception {
+                if (!writeFuture.isSuccess()) {
+                    errorOutReadKey(key);
+                    return;
+                }
+                if (LOG.isDebugEnabled()) {
+                    LOG.debug("Successfully wrote request for reading entry: " + entryId + " ledger-id: "
+                            + ledgerId + " bookie: " + channel.getRemoteAddress());
+                }
+            }
+        });
+
+    }
+
+    /**
+     * Closes the current channel, if there is one.
+     */
+    public void close() {
+        if (channel == null) {
+            // Nothing was ever connected, or the last attempt failed.
+            return;
+        }
+        channel.close();
+    }
+
+    /**
+     * Fails the outstanding read identified by {@code key}, if it is still
+     * registered. The work is submitted to the ordered executor under the
+     * key's ledger id; whoever removes the key from the map invokes the
+     * callback, so the callback runs at most once per request.
+     *
+     * @param key
+     *            (ledgerId, entryId) of the read request to fail
+     */
+    void errorOutReadKey(final CompletionKey key) {
+        executor.submitOrdered(key.ledgerId, new SafeRunnable() {
+            @Override
+            public void safeRun() {
+
+                ReadCompletion readCompletion = readCompletions.remove(key);
+
+                if (readCompletion != null) {
+                    // The channel field may have been reset to null by a
+                    // failed reconnect; snapshot it so that logging cannot
+                    // throw and prevent the callback below from running.
+                    final Channel current = channel;
+                    final Object bAddress = (current != null) ? current.getRemoteAddress() : null;
+
+                    LOG.error("Could not write  request for reading entry: " + key.entryId + " ledger-id: "
+                            + key.ledgerId + " bookie: " + bAddress);
+
+                    readCompletion.cb.readEntryComplete(BKException.Code.BookieHandleNotAvailableException,
+                            key.ledgerId, key.entryId, null, readCompletion.ctx);
+                }
+            }
+
+        });
+    }
+
+    /**
+     * Fails the outstanding add identified by {@code key}, if it is still
+     * registered. The work is submitted to the ordered executor under the
+     * key's ledger id; whoever removes the key from the map invokes the
+     * callback, so the callback runs at most once per request.
+     *
+     * @param key
+     *            (ledgerId, entryId) of the add request to fail
+     */
+    void errorOutAddKey(final CompletionKey key) {
+        executor.submitOrdered(key.ledgerId, new SafeRunnable() {
+            @Override
+            public void safeRun() {
+
+                AddCompletion addCompletion = addCompletions.remove(key);
+
+                if (addCompletion != null) {
+                    // Snapshot the field once: it can be reset to null by a
+                    // failed reconnect, and a channel may report a null
+                    // remote address. String concatenation renders either
+                    // case as "null" instead of throwing, so the callback
+                    // below is always reached.
+                    final Channel current = channel;
+                    final Object bAddress = (current != null) ? current.getRemoteAddress() : null;
+
+                    LOG.error("Could not write request for adding entry: " + key.entryId + " ledger-id: "
+                            + key.ledgerId + " bookie: " + bAddress);
+
+                    addCompletion.cb.writeComplete(BKException.Code.BookieHandleNotAvailableException, key.ledgerId,
+                            key.entryId, addr, addCompletion.ctx);
+                    LOG.error("Invoked callback method: " + key.entryId);
+                }
+            }
+
+        });
+
+    }
+
+    /**
+     * Errors out every pending add and read request registered with this
+     * client by delegating each key to errorOutAddKey / errorOutReadKey, which
+     * schedule the removal and the failure callback on the ordered executor.
+     * In this class it is invoked from channelDisconnected.
+     *
+     * NOTE(review): the earlier wording of this comment said the method is
+     * called from one thread to avoid concurrent executions to QuorumOpMonitor
+     * and that it seemed simpler to call it from BookieHandle. Neither class
+     * is referenced in this file -- confirm whether that still applies.
+     */
+
+    void errorOutOutstandingEntries() {
+
+        // DO NOT rewrite these using Map.Entry iterations. We want to iterate
+        // on keys and see if we are successfully able to remove the key from
+        // the map. Because the add and the read methods also do the same thing
+        // in case they get a write failure on the socket. The one who
+        // successfully removes the key from the map is the one responsible for
+        // calling the application callback.
+
+        for (CompletionKey key : addCompletions.keySet()) {
+            errorOutAddKey(key);
+        }
+
+        for (CompletionKey key : readCompletions.keySet()) {
+            errorOutReadKey(key);
+        }
+    }
+
+    /**
+     * Builds the netty pipeline for a channel to the bookie. Incoming bytes
+     * are first split into frames by a {@link LengthFieldBasedFrameDecoder};
+     * everything else (interpreting responses, prepending the length to
+     * outgoing packets, ...) is done by this class, which is installed as the
+     * second and last handler.
+     */
+    @Override
+    public ChannelPipeline getPipeline() throws Exception {
+        // Frames carry a 4-byte length field at offset 0.
+        LengthFieldBasedFrameDecoder frameDecoder = new LengthFieldBasedFrameDecoder(MAX_FRAME_LENGTH, 0, 4, 0, 4);
+
+        ChannelPipeline result = Channels.pipeline();
+        result.addLast("lengthbasedframedecoder", frameDecoder);
+        result.addLast("mainhandler", this);
+        return result;
+    }
+
+    /**
+     * Called by netty when a channel handled by this object is disconnected.
+     * All pending entries are errored out, the disconnected channel is closed
+     * and the client is marked as not connected. No reconnect is started
+     * here; the next request sent to this address triggers one.
+     *
+     * @param ctx
+     *            handler context supplied by netty
+     * @param e
+     *            state event identifying the channel that was disconnected
+     */
+    @Override
+    public void channelDisconnected(ChannelHandlerContext ctx, ChannelStateEvent e) throws Exception {
+        LOG.info("Disconnected from bookie: " + addr);
+        errorOutOutstandingEntries();
+
+        // Close the channel that raised this event rather than whatever the
+        // channel field currently holds: the field may be null after a failed
+        // reconnect, or may already point to a newer channel.
+        final Channel disconnected = e.getChannel();
+        disconnected.close();
+
+        // The connected flag is otherwise written under this lock by the
+        // connect listener. Leave it alone if a different, newer channel has
+        // been installed in the meantime.
+        synchronized (this) {
+            if (channel == null || channel == disconnected) {
+                connected = false;
+            }
+        }
+
+        // we don't want to reconnect right away. If someone sends a request to
+        // this address, we will reconnect.
+    }
+
+    /**
+     * Called by netty when an exception is raised in one of its threads while
+     * handling this channel. Framing errors are logged as errors, IOExceptions
+     * are ignored, and anything else is logged as fatal.
+     */
+    @Override
+    public void exceptionCaught(ChannelHandlerContext ctx, ExceptionEvent e) throws Exception {
+        final Throwable cause = e.getCause();
+        final boolean framingProblem = cause instanceof CorruptedFrameException
+                || cause instanceof TooLongFrameException;
+
+        if (framingProblem) {
+            LOG.error("Corrupted fram recieved from bookie: " + e.getChannel().getRemoteAddress());
+        } else if (!(cause instanceof IOException)) {
+            // IOExceptions are thrown when a bookie fails; the failure is
+            // already logged by the listeners on the write operation, so they
+            // are skipped here to keep the logs clean. Everything else is
+            // unexpected. As a library we do not terminate the application.
+            LOG.fatal("Unexpected exception caught by bookie client channel handler", cause);
+        }
+    }
+
+    /**
+     * Called by netty when a message is received on a channel. Messages that
+     * are not ChannelBuffers are passed upstream untouched. Otherwise the
+     * response header (type, result code, ledger id, entry id) is read and
+     * the response is dispatched on the ordered executor under its ledger id.
+     */
+    @Override
+    public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) throws Exception {
+        final Object message = e.getMessage();
+        if (!(message instanceof ChannelBuffer)) {
+            ctx.sendUpstream(e);
+            return;
+        }
+
+        final ChannelBuffer buffer = (ChannelBuffer) message;
+        final int type, rc;
+        final long ledgerId, entryId;
+
+        try {
+            type = buffer.readInt();
+            rc = buffer.readInt();
+            ledgerId = buffer.readLong();
+            entryId = buffer.readLong();
+        } catch (IndexOutOfBoundsException ex) {
+            // Response is shorter than the fixed header.
+            LOG.error("Unparseable response from bookie: " + addr, ex);
+            return;
+        }
+
+        final SafeRunnable dispatch = new SafeRunnable() {
+            @Override
+            public void safeRun() {
+                if (type == BookieProtocol.ADDENTRY) {
+                    handleAddResponse(ledgerId, entryId, rc);
+                } else if (type == BookieProtocol.READENTRY) {
+                    // buffer is now positioned just past the header
+                    handleReadResponse(ledgerId, entryId, rc, buffer);
+                } else {
+                    LOG.error("Unexpected response, type: " + type + " recieved from bookie: " + addr + " , ignoring");
+                }
+            }
+
+        };
+        executor.submitOrdered(ledgerId, dispatch);
+    }
+
+    void handleAddResponse(long ledgerId, long entryId, int rc) {
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("Got response for add request from bookie: " + addr + " for ledger: " + ledgerId + " entry: "
+                    + entryId + " rc: " + rc);
+        }
+
+        // convert to BKException code because thats what the uppper
+        // layers expect. This is UGLY, there should just be one set of
+        // error codes.
+        if (rc != BookieProtocol.EOK) {
+            LOG.error("Add for ledger: " + ledgerId + ", entry: " + entryId + " failed on bookie: " + addr
+                    + " with code: " + rc);
+            rc = BKException.Code.WriteException;
+        } else {
+            rc = BKException.Code.OK;
+        }
+
+        AddCompletion ac;
+        ac = addCompletions.remove(new CompletionKey(ledgerId, entryId));
+        if (ac == null) {
+            LOG.error("Unexpected add response received from bookie: " + addr + " for ledger: " + ledgerId
+                    + ", entry: " + entryId + " , ignoring");
+            return;
+        }
+
+        // totalBytesOutstanding.addAndGet(-ac.size);
+
+        ac.cb.writeComplete(rc, ledgerId, entryId, addr, ac.ctx);
+
+    }
+
+    void handleReadResponse(long ledgerId, long entryId, int rc, ChannelBuffer buffer) {
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("Got response for read request from bookie: " + addr + " for ledger: " + ledgerId + " entry: "
+                    + entryId + " rc: " + rc + "entry length: " + buffer.readableBytes());
+        }
+
+        // convert to BKException code because thats what the uppper
+        // layers expect. This is UGLY, there should just be one set of
+        // error codes.
+        if (rc == BookieProtocol.EOK) {
+            rc = BKException.Code.OK;
+        } else if (rc == BookieProtocol.ENOENTRY || rc == BookieProtocol.ENOLEDGER) {
+            rc = BKException.Code.NoSuchEntryException;
+        } else {
+            LOG.error("Read for ledger: " + ledgerId + ", entry: " + entryId + " failed on bookie: " + addr
+                    + " with code: " + rc);
+            rc = BKException.Code.ReadException;
+        }
+
+        CompletionKey key = new CompletionKey(ledgerId, entryId);
+        ReadCompletion readCompletion = readCompletions.remove(key);
+
+        if (readCompletion == null) {
+            /*
+             * This is a special case. When recovering a ledger, a client
+             * submits a read request with id -1, and receives a response with a
+             * different entry id.
+             */
+            readCompletion = readCompletions.remove(new CompletionKey(ledgerId, -1));
+        }
+
+        if (readCompletion == null) {
+            LOG.error("Unexpected read response recieved from bookie: " + addr + " for ledger: " + ledgerId
+                    + ", entry: " + entryId + " , ignoring");
+            return;
+        }
+
+        readCompletion.cb.readEntryComplete(rc, ledgerId, entryId, buffer.slice(), readCompletion.ctx);
+    }
+
+    /**
+     * Boiler-plate wrapper classes follow
+     * 
+     */
+
+    /**
+     * Holds the callback and caller context of one outstanding read request.
+     */
+    private static class ReadCompletion {
+        final ReadEntryCallback cb;
+        final Object ctx;
+
+        public ReadCompletion(ReadEntryCallback callback, Object context) {
+            cb = callback;
+            ctx = context;
+        }
+    }
+
+    /**
+     * Holds the callback and caller context of one outstanding add request.
+     * The entry size is accepted by the constructor but currently not stored.
+     */
+    private static class AddCompletion {
+        final WriteCallback cb;
+        final Object ctx;
+
+        public AddCompletion(WriteCallback callback, long size, Object context) {
+            // size is intentionally ignored for now
+            cb = callback;
+            ctx = context;
+        }
+    }
+
+    /**
+     * Immutable (ledgerId, entryId) pair used as the key of the completion
+     * maps. The fields are final because instances live inside hash maps and
+     * must not change after insertion.
+     */
+    private static class CompletionKey {
+        final long ledgerId;
+        final long entryId;
+
+        CompletionKey(long ledgerId, long entryId) {
+            this.ledgerId = ledgerId;
+            this.entryId = entryId;
+        }
+
+        /**
+         * Two keys are equal when both their ledger id and entry id match.
+         */
+        @Override
+        public boolean equals(Object obj) {
+            // instanceof is false for null, so no separate null check is
+            // needed.
+            if (!(obj instanceof CompletionKey)) {
+                return false;
+            }
+            CompletionKey that = (CompletionKey) obj;
+            return this.ledgerId == that.ledgerId && this.entryId == that.entryId;
+        }
+
+        /**
+         * Combines the low 32 bits of both ids; the ledger id is shifted left
+         * by 16 bits before being xor-ed with the entry id.
+         */
+        @Override
+        public int hashCode() {
+            return ((int) ledgerId << 16) ^ ((int) entryId);
+        }
+
+    }
+
+}

+ 0 - 35
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/ReadEntryCallback.java

@@ -1,35 +0,0 @@
-package org.apache.bookkeeper.proto;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.nio.ByteBuffer;
-
-/**
- * Declaration of a callback implementation for calls from BookieClient
- * objects. Such calls are for replies of read operations (operations to
- * read an entry from a ledger).
- *
- */
-
-public interface ReadEntryCallback {
-    void readEntryComplete(int rc, long ledgerId, long entryId, ByteBuffer bb, Object ctx);
-}

+ 51 - 36
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/ServerStats.java

@@ -14,45 +14,51 @@
 
 package org.apache.bookkeeper.proto;
 
-
 public class ServerStats {
-    private static ServerStats instance= new ServerStats();
+    private static ServerStats instance = new ServerStats();
     private long packetsSent;
     private long packetsReceived;
     private long maxLatency;
     private long minLatency = Long.MAX_VALUE;
     private long totalLatency = 0;
     private long count = 0;
-    
-    public interface Provider{
+
+    public interface Provider {
         public long getOutstandingRequests();
+
         public long getLastProcessedZxid();
     }
-    private Provider provider=null;
-    private Object mutex=new Object();
-    
-    static public ServerStats getInstance(){
+
+    private Provider provider = null;
+    private Object mutex = new Object();
+
+    static public ServerStats getInstance() {
         return instance;
     }
+
     static public void registerAsConcrete() {
         setInstance(new ServerStats());
     }
+
     static synchronized public void unregister() {
-        instance=null;
+        instance = null;
     }
-    static synchronized protected void setInstance(ServerStats newInstance){
-        assert instance==null;
+
+    static synchronized protected void setInstance(ServerStats newInstance) {
+        assert instance == null;
         instance = newInstance;
     }
-    protected ServerStats(){}
-    
+
+    protected ServerStats() {
+    }
+
     // getters
     synchronized public long getMinLatency() {
         return (minLatency == Long.MAX_VALUE) ? 0 : minLatency;
     }
 
     synchronized public long getAvgLatency() {
-        if(count!=0)
+        if (count != 0)
             return totalLatency / count;
         return 0;
     }
@@ -62,15 +68,17 @@ public class ServerStats {
     }
 
     public long getOutstandingRequests() {
-        synchronized(mutex){
-            return (provider!=null)?provider.getOutstandingRequests():-1;
+        synchronized (mutex) {
+            return (provider != null) ? provider.getOutstandingRequests() : -1;
         }
     }
-    public long getLastProcessedZxid(){
-        synchronized(mutex){
-            return (provider!=null)?provider.getLastProcessedZxid():-1;
+
+    public long getLastProcessedZxid() {
+        synchronized (mutex) {
+            return (provider != null) ? provider.getLastProcessedZxid() : -1;
         }
     }
+
     synchronized public long getPacketsReceived() {
         return packetsReceived;
     }
@@ -79,29 +87,31 @@ public class ServerStats {
         return packetsSent;
     }
 
-    public String getServerState(){
+    public String getServerState() {
         return "standalone";
     }
-    
-    public String toString(){
+
+    @Override
+    public String toString() {
         StringBuilder sb = new StringBuilder();
-        sb.append("Latency min/avg/max: " + getMinLatency() + "/"
-                + getAvgLatency() + "/" + getMaxLatency() + "\n");
+        sb.append("Latency min/avg/max: " + getMinLatency() + "/" + getAvgLatency() + "/" + getMaxLatency() + "\n");
         sb.append("Received: " + getPacketsReceived() + "\n");
         sb.append("Sent: " + getPacketsSent() + "\n");
         if (provider != null) {
             sb.append("Outstanding: " + getOutstandingRequests() + "\n");
-            sb.append("Zxid: 0x"+ Long.toHexString(getLastProcessedZxid())+ "\n");
+            sb.append("Zxid: 0x" + Long.toHexString(getLastProcessedZxid()) + "\n");
         }
-        sb.append("Mode: "+getServerState()+"\n");
+        sb.append("Mode: " + getServerState() + "\n");
         return sb.toString();
     }
+
     // mutators
-    public void setStatsProvider(Provider zk){
-        synchronized(mutex){
-            provider=zk;
+    public void setStatsProvider(Provider zk) {
+        synchronized (mutex) {
+            provider = zk;
         }
     }
+
     synchronized void updateLatency(long requestCreateTime) {
         long latency = System.currentTimeMillis() - requestCreateTime;
         totalLatency += latency;
@@ -113,21 +123,26 @@ public class ServerStats {
             maxLatency = latency;
         }
     }
-    synchronized public void resetLatency(){
-        totalLatency=count=maxLatency=0;
-        minLatency=Long.MAX_VALUE;
+
+    synchronized public void resetLatency() {
+        totalLatency = count = maxLatency = 0;
+        minLatency = Long.MAX_VALUE;
     }
-    synchronized public void resetMaxLatency(){
-        maxLatency=getMinLatency();
+
+    synchronized public void resetMaxLatency() {
+        maxLatency = getMinLatency();
     }
+
     synchronized public void incrementPacketsReceived() {
         packetsReceived++;
     }
+
     synchronized public void incrementPacketsSent() {
         packetsSent++;
     }
-    synchronized public void resetRequestCounters(){
-        packetsReceived=packetsSent=0;
+
+    synchronized public void resetRequestCounters() {
+        packetsReceived = packetsSent = 0;
     }
 
 }

+ 0 - 32
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/proto/WriteCallback.java

@@ -1,32 +0,0 @@
-package org.apache.bookkeeper.proto;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-/**
- * Declaration of a callback implementation for calls from BookieClient
- * objects. Such calls are for replies of write operations (operations to
- * add an entry to a ledger).
- *
- */
-
-public interface WriteCallback {
-    void writeComplete(int rc, long ledgerId, long entryId, Object ctx);
-}

+ 32 - 28
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/streaming/LedgerInputStream.java

@@ -23,11 +23,11 @@ package org.apache.bookkeeper.streaming;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
+import java.util.Enumeration;
 
 import org.apache.bookkeeper.client.BKException;
 import org.apache.bookkeeper.client.LedgerEntry;
 import org.apache.bookkeeper.client.LedgerHandle;
-import org.apache.bookkeeper.client.LedgerSequence;
 import org.apache.log4j.Logger;
 
 public class LedgerInputStream extends InputStream {
@@ -35,14 +35,16 @@ public class LedgerInputStream extends InputStream {
     private LedgerHandle lh;
     private ByteBuffer bytebuff;
     byte[] bbytes;
-    long lastEntry =0;
+    long lastEntry = 0;
     int increment = 50;
     int defaultSize = 1024 * 1024; // 1MB default size
-    LedgerSequence ledgerSeq = null;
-    
+    Enumeration<LedgerEntry> ledgerSeq = null;
+
     /**
      * construct an input stream from a ledger handle
-     * @param lh ledger handle
+     * 
+     * @param lh
+     *            ledger handle
      * @throws {@link BKException}, {@link InterruptedException}
      */
     public LedgerInputStream(LedgerHandle lh) throws BKException, InterruptedException {
@@ -50,14 +52,17 @@ public class LedgerInputStream extends InputStream {
         bbytes = new byte[defaultSize];
         this.bytebuff = ByteBuffer.wrap(bbytes);
         this.bytebuff.position(this.bytebuff.limit());
-        lastEntry = Math.max(lh.getLast(), increment);
+        lastEntry = Math.min(lh.getLastAddConfirmed(), increment);
         ledgerSeq = lh.readEntries(0, lastEntry);
     }
 
     /**
      * construct an input stream from a ledger handle
-     * @param lh the ledger handle
-     * @param size the size of the buffer
+     * 
+     * @param lh
+     *            the ledger handle
+     * @param size
+     *            the size of the buffer
      * @throws {@link BKException}, {@link InterruptedException}
      */
     public LedgerInputStream(LedgerHandle lh, int size) throws BKException, InterruptedException {
@@ -65,38 +70,37 @@ public class LedgerInputStream extends InputStream {
         bbytes = new byte[size];
         this.bytebuff = ByteBuffer.wrap(bbytes);
         this.bytebuff.position(this.bytebuff.limit());
-        lastEntry = Math.max(lh.getLast(), increment);
+        lastEntry = Math.min(lh.getLastAddConfirmed(), increment);
         ledgerSeq = lh.readEntries(0, lastEntry);
     }
-    
-    
+
     @Override
     public void close() {
         // do nothing
-        // let the applciation
+        // let the application
         // close the ledger
     }
-    
+
     /**
-     * refill the buffer, we 
-     * need to read more bytes
+     * refill the buffer, we need to read more bytes
+     * 
      * @return if we can refill or not
      */
     private synchronized boolean refill() throws IOException {
         bytebuff.clear();
-        if (!ledgerSeq.hasMoreElements() && lastEntry >= lh.getLast()) {
+        if (!ledgerSeq.hasMoreElements() && lastEntry >= lh.getLastAddConfirmed()) {
             return false;
         }
         if (!ledgerSeq.hasMoreElements()) {
-            //do refill 
-            long last = Math.max( lastEntry + increment, lh.getLast());
+            // do refill
+            long last = Math.min(lastEntry + increment, lh.getLastAddConfirmed());
             try {
                 ledgerSeq = lh.readEntries(lastEntry + 1, last);
-            } catch(BKException bk) {
+            } catch (BKException bk) {
                 IOException ie = new IOException(bk.getMessage());
                 ie.initCause(bk);
                 throw ie;
-            } catch(InterruptedException ie) {
+            } catch (InterruptedException ie) {
                 Thread.currentThread().interrupt();
             }
             lastEntry = last;
@@ -106,7 +110,7 @@ public class LedgerInputStream extends InputStream {
         bytebuff = ByteBuffer.wrap(bbytes);
         return true;
     }
-    
+
     @Override
     public synchronized int read() throws IOException {
         boolean toread = true;
@@ -120,10 +124,10 @@ public class LedgerInputStream extends InputStream {
         }
         return -1;
     }
-    
+
     @Override
     public synchronized int read(byte[] b) throws IOException {
-        // be smart ... just copy the bytes 
+        // be smart ... just copy the bytes
         // once and return the size
         // user will call it again
         boolean toread = true;
@@ -133,19 +137,19 @@ public class LedgerInputStream extends InputStream {
         if (toread) {
             int bcopied = bytebuff.remaining();
             int tocopy = Math.min(bcopied, b.length);
-            //cannot used gets because of
+            // cannot used gets because of
             // the underflow/overflow exceptions
-            System.arraycopy(bbytes, bytebuff.position(), b,0, tocopy);
+            System.arraycopy(bbytes, bytebuff.position(), b, 0, tocopy);
             bytebuff.position(bytebuff.position() + tocopy);
             return tocopy;
         }
         return -1;
     }
-    
+
     @Override
     public synchronized int read(byte[] b, int off, int len) throws IOException {
-        //again dont need ot fully
-        // fill b, just return 
+        // again, we don't need to fully
+        // fill b; just return
         // what we have and let the application call read
         // again
         boolean toread = true;

+ 32 - 33
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/streaming/LedgerOutputStream.java

@@ -28,15 +28,11 @@ import org.apache.bookkeeper.client.BKException;
 import org.apache.bookkeeper.client.LedgerHandle;
 import org.apache.log4j.Logger;
 
-
 /**
- * this class provides a streaming api 
- * to get an output stream from a ledger
- * handle and write to it as a stream of 
- * bytes. This is built on top of ledgerhandle
- * api and uses a buffer to cache the data
- * written to it and writes out the entry 
- * to the ledger.
+ * this class provides a streaming api to get an output stream from a ledger
+ * handle and write to it as a stream of bytes. This is built on top of
+ * ledgerhandle api and uses a buffer to cache the data written to it and writes
+ * out the entry to the ledger.
  */
 public class LedgerOutputStream extends OutputStream {
     Logger LOG = Logger.getLogger(LedgerOutputStream.class);
@@ -44,62 +40,66 @@ public class LedgerOutputStream extends OutputStream {
     private ByteBuffer bytebuff;
     byte[] bbytes;
     int defaultSize = 1024 * 1024; // 1MB default size
-    
+
     /**
      * construct a outputstream from a ledger handle
-     * @param lh ledger handle
+     * 
+     * @param lh
+     *            ledger handle
      */
     public LedgerOutputStream(LedgerHandle lh) {
         this.lh = lh;
         bbytes = new byte[defaultSize];
         this.bytebuff = ByteBuffer.wrap(bbytes);
     }
-    
+
     /**
      * construct a outputstream from a ledger handle
-     * @param lh the ledger handle
-     * @param size the size of the buffer
+     * 
+     * @param lh
+     *            the ledger handle
+     * @param size
+     *            the size of the buffer
      */
     public LedgerOutputStream(LedgerHandle lh, int size) {
         this.lh = lh;
         bbytes = new byte[size];
         this.bytebuff = ByteBuffer.wrap(bbytes);
     }
-    
+
     @Override
     public void close() {
-        //flush everything
+        // flush everything
         // we have
         flush();
     }
-    
+
     @Override
     public synchronized void flush() {
-        // lets flush all the data 
+        // lets flush all the data
         // into the ledger entry
         if (bytebuff.position() > 0) {
-            //copy the bytes into 
+            // copy the bytes into
             // a new byte buffer and send it out
             byte[] b = new byte[bytebuff.position()];
             LOG.info("Comment: flushing with params " + " " + bytebuff.position());
             System.arraycopy(bbytes, 0, b, 0, bytebuff.position());
             try {
                 lh.addEntry(b);
-            } catch(InterruptedException ie) {
+            } catch (InterruptedException ie) {
                 LOG.warn("Interrupted while flusing " + ie);
                 Thread.currentThread().interrupt();
-            } catch(BKException bke) {
+            } catch (BKException bke) {
                 LOG.warn("BookKeeper exception ", bke);
             }
         }
     }
-    
+
     /**
-     * make space for len bytes to be written
-     * to the buffer. 
+     * make space for len bytes to be written to the buffer.
+     * 
      * @param len
-     * @return if true then we can make space for len
-     * if false we cannot
+     * @return if true then we can make space for len if false we cannot
      */
     private boolean makeSpace(int len) {
         if (bytebuff.remaining() < len) {
@@ -111,34 +111,33 @@ public class LedgerOutputStream extends OutputStream {
         }
         return true;
     }
-    
+
     @Override
     public synchronized void write(byte[] b) {
         if (makeSpace(b.length)) {
             bytebuff.put(b);
-        }
-        else {
+        } else {
             try {
                 lh.addEntry(b);
-            } catch(InterruptedException ie) {
+            } catch (InterruptedException ie) {
                 LOG.warn("Interrupted while writing", ie);
                 Thread.currentThread().interrupt();
-            } catch(BKException bke) {
+            } catch (BKException bke) {
                 LOG.warn("BookKeeper exception", bke);
             }
         }
     }
-    
+
     @Override
     public synchronized void write(byte[] b, int off, int len) {
         if (!makeSpace(len)) {
-            //lets try making the buffer bigger
+            // lets try making the buffer bigger
             bbytes = new byte[len];
             bytebuff = ByteBuffer.wrap(bbytes);
         }
         bytebuff.put(b, off, len);
     }
-    
+
     @Override
     public synchronized void write(int b) throws IOException {
         makeSpace(1);

+ 2 - 8
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/LocalBookKeeper.java

@@ -25,9 +25,6 @@ import java.io.InputStreamReader;
 import java.io.OutputStream;
 import java.net.Socket;
 
-import org.apache.bookkeeper.client.BookKeeper;
-import org.apache.bookkeeper.client.LedgerHandle;
-import org.apache.bookkeeper.client.LedgerSequence;
 import org.apache.bookkeeper.proto.BookieServer;
 import org.apache.log4j.ConsoleAppender;
 import org.apache.log4j.Level;
@@ -40,11 +37,8 @@ import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.ZooKeeper;
 import org.apache.zookeeper.ZooDefs.Ids;
 import org.apache.zookeeper.server.NIOServerCnxn;
-import org.apache.zookeeper.server.ServerStats;
 import org.apache.zookeeper.server.ZooKeeperServer;
 
-import org.apache.log4j.Logger;
-
 public class LocalBookKeeper {
     protected static final Logger LOG = Logger.getLogger(LocalBookKeeper.class);
     public static final int CONNECTION_TIMEOUT = 30000;
@@ -98,7 +92,7 @@ public class LocalBookKeeper {
 			// TODO Auto-generated catch block
 			LOG.fatal("Exception while instantiating ZooKeeper", e);
 		} 
-		
+
         boolean b = waitForServerUp(HOSTPORT, CONNECTION_TIMEOUT);
         LOG.debug("ZooKeeper server up: " + b);
 	}
@@ -210,5 +204,5 @@ public class LocalBookKeeper {
         }
         return false;
     }
-
+	
 }

+ 6 - 7
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/Main.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.util;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,30 +21,28 @@ package org.apache.bookkeeper.util;
  * 
  */
 
-
 import java.io.IOException;
 
 import org.apache.bookkeeper.proto.BookieClient;
 import org.apache.bookkeeper.proto.BookieServer;
 
-
 public class Main {
 
     static void usage() {
         System.err.println("USAGE: bookeeper client|bookie");
     }
+
     /**
      * @param args
-     * @throws InterruptedException 
-     * @throws IOException 
+     * @throws InterruptedException
+     * @throws IOException
      */
     public static void main(String[] args) throws IOException, InterruptedException {
-        if (args.length < 1 || !(args[0].equals("client") || 
-                args[0].equals("bookie"))) {
+        if (args.length < 1 || !(args[0].equals("client") || args[0].equals("bookie"))) {
             usage();
             return;
         }
-        String newArgs[] = new String[args.length-1];
+        String newArgs[] = new String[args.length - 1];
         System.arraycopy(args, 1, newArgs, 0, newArgs.length);
         if (args[0].equals("bookie")) {
             BookieServer.main(newArgs);

+ 38 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/MathUtils.java

@@ -0,0 +1,38 @@
+package org.apache.bookkeeper.util;
+
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Miscellaneous math helpers that are not part of the standard library.
+ */
+public class MathUtils {
+
+    /**
+     * Computes {@code dividend % divisor} and, when that remainder is
+     * negative, adds {@code divisor} to it once.
+     *
+     * For a positive divisor the result therefore always lies in
+     * {@code [0, divisor)}, which makes it usable as an array index.
+     *
+     * @param dividend
+     *            value to reduce
+     * @param divisor
+     *            modulus; a zero divisor raises ArithmeticException
+     * @return the remainder, shifted up by {@code divisor} if it was negative
+     */
+    public static int signSafeMod(long dividend, int divisor) {
+        final int remainder = (int) (dividend % divisor);
+        return remainder < 0 ? remainder + divisor : remainder;
+    }
+
+}

+ 98 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/OrderedSafeExecutor.java

@@ -0,0 +1,98 @@
+package org.apache.bookkeeper.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Random;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+
+/**
+ * This class provides 2 things over the java {@link ScheduledExecutorService}.
+ * 
+ * 1. It accepts {@link SafeRunnable} objects instead of plain Runnable
+ * objects, so an exception thrown by a submitted task is caught and logged
+ * rather than going unnoticed.
+ * 
+ * 2. It supports submitting tasks with an ordering key. Every key is mapped to
+ * one of the single-threaded workers through the key's
+ * {@link Object#hashCode()}, so tasks submitted with the same key run in
+ * submission order, while tasks with different keys may run in parallel. This
+ * keeps parallelism while giving the basic ordering we want (e.g., per ledger
+ * handle).
+ * 
+ */
+public class OrderedSafeExecutor {
+    ExecutorService threads[];
+    Random rand = new Random();
+
+    /**
+     * Creates the executor with a fixed number of single-threaded workers.
+     * 
+     * @param numThreads
+     *            number of workers; must be positive
+     * @throws IllegalArgumentException
+     *             if numThreads is zero or negative
+     */
+    public OrderedSafeExecutor(int numThreads) {
+        if (numThreads <= 0) {
+            throw new IllegalArgumentException();
+        }
+
+        ExecutorService[] workers = new ExecutorService[numThreads];
+        for (int idx = 0; idx < workers.length; idx++) {
+            workers[idx] = Executors.newSingleThreadExecutor();
+        }
+        threads = workers;
+    }
+
+    /**
+     * Picks a worker at random; with a single worker no random number is
+     * drawn.
+     */
+    ExecutorService chooseThread() {
+        int slot = (threads.length == 1) ? 0 : rand.nextInt(threads.length);
+        return threads[slot];
+    }
+
+    /**
+     * Picks the worker that owns the given ordering key; with a single worker
+     * the key's hash code is never computed.
+     */
+    ExecutorService chooseThread(Object orderingKey) {
+        int slot = (threads.length == 1) ? 0 : MathUtils.signSafeMod(orderingKey.hashCode(), threads.length);
+        return threads[slot];
+    }
+
+    /**
+     * schedules a one time action to execute on a randomly chosen worker
+     * 
+     * @param r
+     *            the task to run
+     */
+    public void submit(SafeRunnable r) {
+        ExecutorService worker = chooseThread();
+        worker.submit(r);
+    }
+
+    /**
+     * schedules a one time action to execute with an ordering guarantee on the
+     * key
+     * 
+     * @param orderingKey
+     *            tasks sharing this key's worker run in submission order
+     * @param r
+     *            the task to run
+     */
+    public void submitOrdered(Object orderingKey, SafeRunnable r) {
+        ExecutorService worker = chooseThread(orderingKey);
+        worker.submit(r);
+    }
+
+    /**
+     * Calls {@link ExecutorService#shutdown()} on every worker.
+     */
+    public void shutdown() {
+        for (ExecutorService worker : threads) {
+            worker.shutdown();
+        }
+    }
+
+}

+ 38 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/SafeRunnable.java

@@ -0,0 +1,38 @@
+package org.apache.bookkeeper.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.log4j.Logger;
+
+/**
+ * A {@link Runnable} whose {@link #run()} never lets a throwable escape:
+ * subclasses put their work in {@link #safeRun()}, and anything it throws is
+ * caught and logged at fatal level.
+ */
+public abstract class SafeRunnable implements Runnable {
+
+    static final Logger logger = Logger.getLogger(SafeRunnable.class);
+
+    /**
+     * The actual work of this task, supplied by subclasses.
+     */
+    public abstract void safeRun();
+
+    /**
+     * Invokes {@link #safeRun()} and logs, instead of propagating, any
+     * throwable it raises.
+     */
+    @Override
+    public void run() {
+        try {
+            safeRun();
+        } catch (Throwable unexpected) {
+            logger.fatal("Unexpected throwable caught ", unexpected);
+        }
+    }
+
+}

+ 94 - 0
src/contrib/bookkeeper/src/java/org/apache/bookkeeper/util/StringUtils.java

@@ -0,0 +1,94 @@
+package org.apache.bookkeeper.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+
+/**
+ * Provides utilities for parsing network addresses, extracting ledger ids
+ * from node paths, etc.
+ * 
+ */
+public class StringUtils {
+
+    /*
+     * Path to ledger metadata. ZooKeeper appends a sequence number to L.
+     */
+    static public final String prefix = "/ledgers/L";
+
+    /**
+     * Parses a host:port string into a socket address.
+     * 
+     * @param s
+     *            address of the form host:port
+     * @return the socket address built from the host and port parts
+     * @throws IOException
+     *             if s does not split into exactly two colon-separated parts,
+     *             or the port part is not an integer
+     */
+    public static InetSocketAddress parseAddr(String s) throws IOException {
+
+        String parts[] = s.split(":");
+        if (parts.length != 2) {
+            throw new IOException(s + " does not have the form host:port");
+        }
+        int port;
+        try {
+            port = Integer.parseInt(parts[1]);
+        } catch (NumberFormatException e) {
+            // keep the parse failure as the cause so it is not lost
+            throw new IOException(s + " does not have the form host:port", e);
+        }
+
+        return new InetSocketAddress(parts[0], port);
+    }
+
+    /**
+     * Appends the address as ip:port to the given builder.
+     * 
+     * @param sb
+     *            builder to append to
+     * @param addr
+     *            address to format; NOTE(review): getAddress() is dereferenced
+     *            directly, so this presumably expects a resolved address
+     * @return sb, to allow chaining
+     */
+    public static StringBuilder addrToString(StringBuilder sb, InetSocketAddress addr) {
+        return sb.append(addr.getAddress().getHostAddress()).append(":").append(addr.getPort());
+    }
+
+    /**
+     * Formats ledger ID according to ZooKeeper rules: zero-padded to a width
+     * of ten characters.
+     * 
+     * @param id
+     *            znode id
+     * @return the padded id
+     */
+    public static String getZKStringId(long id) {
+        return String.format("%010d", id);
+    }
+
+    /**
+     * Get the path for the ledger metadata node
+     * 
+     * @param ledgerId
+     *            id of the ledger
+     * @return {@link #prefix} followed by the formatted ledger id
+     */
+    public static String getLedgerNodePath(long ledgerId) {
+        return prefix + StringUtils.getZKStringId(ledgerId);
+    }
+
+    /**
+     * Extracts the ledger id from a node name, i.e. the number that follows
+     * the last occurrence of {@link #prefix}.
+     * 
+     * @param nodeName
+     *            node name or path to parse
+     * @return the ledger id
+     * @throws IOException
+     *             if nothing follows the prefix or the remainder is not a
+     *             number
+     */
+    public static long getLedgerId(String nodeName) throws IOException {
+        String parts[] = nodeName.split(prefix);
+        if (parts.length == 0) {
+            // split() drops trailing empty strings, so a name that is nothing
+            // but the prefix yields an empty array; report it as malformed
+            // instead of failing with an index error
+            throw new IOException(nodeName + " does not contain a ledger id");
+        }
+        try {
+            return Long.parseLong(parts[parts.length - 1]);
+        } catch (NumberFormatException e) {
+            throw new IOException(e);
+        }
+    }
+
+}

+ 109 - 247
src/contrib/bookkeeper/test/org/apache/bookkeeper/test/AsyncLedgerOpsTest.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.test;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,192 +21,158 @@ package org.apache.bookkeeper.test;
  * 
  */
 
-
-import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Enumeration;
 import java.util.Random;
 import java.util.Set;
 
 import org.apache.bookkeeper.client.AsyncCallback.AddCallback;
-import org.apache.bookkeeper.client.BKException;
-import org.apache.bookkeeper.client.BookKeeper;
+import org.apache.bookkeeper.client.LedgerEntry;
 import org.apache.bookkeeper.client.AsyncCallback.CloseCallback;
 import org.apache.bookkeeper.client.AsyncCallback.CreateCallback;
 import org.apache.bookkeeper.client.AsyncCallback.OpenCallback;
 import org.apache.bookkeeper.client.LedgerHandle;
-import org.apache.bookkeeper.client.LedgerSequence;
 import org.apache.bookkeeper.client.AsyncCallback.ReadCallback;
-import org.apache.bookkeeper.client.LedgerHandle.QMode;
-import org.apache.bookkeeper.proto.BookieServer;
-import org.apache.bookkeeper.streaming.LedgerInputStream;
-import org.apache.bookkeeper.streaming.LedgerOutputStream;
-
-import org.apache.log4j.ConsoleAppender;
-import org.apache.log4j.Level;
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
 import org.apache.log4j.Logger;
-import org.apache.log4j.PatternLayout;
-import org.apache.zookeeper.CreateMode;
-import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.WatchedEvent;
-import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.ZooKeeper;
-import org.apache.zookeeper.ZooDefs.Ids;
-import org.apache.zookeeper.server.NIOServerCnxn;
-import org.apache.zookeeper.server.ZooKeeperServer;
-import org.apache.zookeeper.test.ClientBase;
-
+import org.junit.Before;
 import org.junit.Test;
-
-//import BookieReadWriteTest.SyncObj;
-//import BookieReadWriteTest.emptyWatcher;
+import org.junit.runners.Parameterized.Parameters;
 
 /**
- * This test tests read and write, synchronous and 
- * asynchronous, strings and integers for a BookKeeper client. 
- * The test deployment uses a ZooKeeper server 
- * and three BookKeepers. 
+ * This test tests read and write, synchronous and asynchronous, strings and
+ * integers for a BookKeeper client. The test deployment uses a ZooKeeper server
+ * and three BookKeepers.
  * 
  */
-
-public class AsyncLedgerOpsTest 
-    extends junit.framework.TestCase 
-    implements AddCallback, 
-    ReadCallback, 
-    CreateCallback,
-    CloseCallback,
-    OpenCallback{
+public class AsyncLedgerOpsTest extends BaseTestCase implements AddCallback, ReadCallback, CreateCallback,
+        CloseCallback, OpenCallback {
     static Logger LOG = Logger.getLogger(BookieClientTest.class);
 
-    static ConsoleAppender ca = new ConsoleAppender(new PatternLayout());
-
-    // ZooKeeper related variables
-    private static final String HOSTPORT = "127.0.0.1:2181";
-    static Integer ZooKeeperDefaultPort = 2181;
-    ZooKeeperServer zks;
-    ZooKeeper zkc; //zookeeper client
-    NIOServerCnxn.Factory serverFactory;
-    File ZkTmpDir;
+    DigestType digestType;
+    
+    public AsyncLedgerOpsTest(DigestType digestType) {
+        super(3);
+        this.digestType = digestType;
+    }
+    
+    @Parameters
+    public static Collection<Object[]> configs(){
+        return Arrays.asList(new Object[][]{ {DigestType.MAC }, {DigestType.CRC32}});
+    }
     
-    //BookKeeper 
-    File tmpDirB1, tmpDirB2, tmpDirB3;
-    BookieServer bs1, bs2, bs3;
-    Integer initialPort = 5000;
-    BookKeeper bkc; // bookkeeper client
+ 
     byte[] ledgerPassword = "aaa".getBytes();
     LedgerHandle lh, lh2;
     long ledgerId;
-    LedgerSequence ls;
-    
-    //test related variables 
+    Enumeration<LedgerEntry> ls;
+
+    // test related variables
     int numEntriesToWrite = 20;
     int maxInt = 2147483647;
-    Random rng; // Random Number Generator 
+    Random rng; // Random Number Generator
     ArrayList<byte[]> entries; // generated entries
     ArrayList<Integer> entriesSize;
-    
+
     // Synchronization
     SyncObj sync;
     Set<Object> syncObjs;
-    
+
     class SyncObj {
         int counter;
-        boolean value;      
+        boolean value;
+
         public SyncObj() {
             counter = 0;
             value = false;
-        }       
+        }
     }
-    
-    class ControlObj{
+
+    class ControlObj {
         LedgerHandle lh;
-        
-        void setLh(LedgerHandle lh){
+
+        void setLh(LedgerHandle lh) {
             this.lh = lh;
         }
-        
-        LedgerHandle getLh(){
+
+        LedgerHandle getLh() {
             return lh;
         }
     }
-    
+
     @Test
-    public void testAsyncCreateClose() throws IOException{
+    public void testAsyncCreateClose() throws IOException {
         try {
-            // Create a BookKeeper client and a ledger
-            bkc = new BookKeeper("127.0.0.1");
-           
-            ControlObj ctx = new ControlObj();
             
-            synchronized(ctx){
-                bkc.asyncCreateLedger(3, 2, 
-                    QMode.VERIFIABLE, 
-                    ledgerPassword,
-                    this,
-                    ctx);
+            ControlObj ctx = new ControlObj();
+
+            synchronized (ctx) {
+                LOG.info("Going to create ledger asynchronously");
+                bkc.asyncCreateLedger(3, 2, digestType, ledgerPassword, this, ctx);
+
                 ctx.wait();
             }
-            
-            
-            //bkc.initMessageDigest("SHA1");
+
+            // bkc.initMessageDigest("SHA1");
             LedgerHandle lh = ctx.getLh();
             ledgerId = lh.getId();
             LOG.info("Ledger ID: " + lh.getId());
-            for(int i = 0; i < numEntriesToWrite; i++){
+            for (int i = 0; i < numEntriesToWrite; i++) {
                 ByteBuffer entry = ByteBuffer.allocate(4);
                 entry.putInt(rng.nextInt(maxInt));
                 entry.position(0);
-                
+
                 entries.add(entry.array());
                 entriesSize.add(entry.array().length);
                 lh.asyncAddEntry(entry.array(), this, sync);
             }
-            
+
             // wait for all entries to be acknowledged
             synchronized (sync) {
-                if (sync.counter < numEntriesToWrite){
+                while (sync.counter < numEntriesToWrite) {
                     LOG.debug("Entries counter = " + sync.counter);
                     sync.wait();
                 }
             }
-            
+
             LOG.debug("*** WRITE COMPLETE ***");
-            // close ledger 
-            synchronized(ctx){
+            // close ledger
+            synchronized (ctx) {
                 lh.asyncClose(this, ctx);
                 ctx.wait();
             }
-            
-            //*** WRITING PART COMPLETE // READ PART BEGINS ***
-            
+
+            // *** WRITING PART COMPLETE // READ PART BEGINS ***
+
             // open ledger
-            synchronized(ctx){
-                bkc.asyncOpenLedger(ledgerId, ledgerPassword, this, ctx);
+            synchronized (ctx) {
+                bkc.asyncOpenLedger(ledgerId, digestType, ledgerPassword, this, ctx);
                 ctx.wait();
             }
             lh = ctx.getLh();
-            
-            LOG.debug("Number of entries written: " + lh.getLast());
-            assertTrue("Verifying number of entries written", lh.getLast() == numEntriesToWrite);       
-            
-            //read entries
-            lh.asyncReadEntries(0, numEntriesToWrite - 1, this, (Object) sync);
-            
+
+            LOG.debug("Number of entries written: " + lh.getLastAddConfirmed());
+            assertTrue("Verifying number of entries written", lh.getLastAddConfirmed() == (numEntriesToWrite - 1));
+
+            // read entries
+            lh.asyncReadEntries(0, numEntriesToWrite - 1, this, sync);
+
             synchronized (sync) {
-                while(sync.value == false){
+                while (sync.value == false) {
                     sync.wait();
-                }               
+                }
             }
-            
-            assertTrue("Checking number of read entries", ls.size() == numEntriesToWrite);
-            
+
             LOG.debug("*** READ COMPLETE ***");
-            
-            // at this point, LedgerSequence ls is filled with the returned values
+
+            // at this point, the enumeration ls holds the entries returned
+            // by the read
             int i = 0;
-            while(ls.hasMoreElements()){
+            while (ls.hasMoreElements()) {
                 ByteBuffer origbb = ByteBuffer.wrap(entries.get(i));
                 Integer origEntry = origbb.getInt();
                 byte[] entry = ls.nextElement().getEntry();
@@ -219,20 +186,17 @@ public class AsyncLedgerOpsTest
                 assertTrue("Checking entry " + i + " for size", entry.length == entriesSize.get(i).intValue());
                 i++;
             }
+            assertTrue("Checking number of read entries", i == numEntriesToWrite);
             lh.close();
-        } catch (KeeperException e) {
-            e.printStackTrace();
-        } catch (BKException e) {
-            e.printStackTrace();
         } catch (InterruptedException e) {
-            e.printStackTrace();
-        } //catch (NoSuchAlgorithmException e) {
-        //  e.printStackTrace();
-        //}
-        
+            LOG.error(e);
+            fail("InterruptedException");
+        } // catch (NoSuchAlgorithmException e) {
+        // e.printStackTrace();
+        // }
+
     }
-    
-    
+
     public void addComplete(int rc, LedgerHandle lh, long entryId, Object ctx) {
         SyncObj x = (SyncObj) ctx;
         synchronized (x) {
@@ -241,154 +205,52 @@ public class AsyncLedgerOpsTest
         }
     }
 
-    public void readComplete(int rc, LedgerHandle lh, LedgerSequence seq,
-            Object ctx) {
-        ls = seq;               
+    public void readComplete(int rc, LedgerHandle lh, Enumeration<LedgerEntry> seq, Object ctx) {
+        ls = seq;
         synchronized (sync) {
             sync.value = true;
             sync.notify();
         }
-        
+
     }
-    
-    public void createComplete(int rc, LedgerHandle lh, Object ctx){
-        synchronized(ctx){
+
+    public void createComplete(int rc, LedgerHandle lh, Object ctx) {
+        synchronized (ctx) {
             ControlObj cobj = (ControlObj) ctx;
             cobj.setLh(lh);
             cobj.notify();
-        }   
+        }
     }
-    
-    public void openComplete(int rc, LedgerHandle lh, Object ctx){
-        synchronized(ctx){
+
+    public void openComplete(int rc, LedgerHandle lh, Object ctx) {
+        synchronized (ctx) {
             ControlObj cobj = (ControlObj) ctx;
             cobj.setLh(lh);
             cobj.notify();
-        }   
+        }
     }
-    
-    public void closeComplete(int rc, LedgerHandle lh, Object ctx){
-        synchronized(ctx){
+
+    public void closeComplete(int rc, LedgerHandle lh, Object ctx) {
+        synchronized (ctx) {
             ControlObj cobj = (ControlObj) ctx;
             cobj.notify();
         }
     }
-     
-    protected void setUp() throws IOException {
-        LOG.addAppender(ca);
-        LOG.setLevel((Level) Level.DEBUG);
-        
-        // create a ZooKeeper server(dataDir, dataLogDir, port)
-        LOG.debug("Running ZK server");
-        //ServerStats.registerAsConcrete();
-        ClientBase.setupTestEnv();
-        ZkTmpDir = File.createTempFile("zookeeper", "test");
-        ZkTmpDir.delete();
-        ZkTmpDir.mkdir();
-            
-        try {
-            zks = new ZooKeeperServer(ZkTmpDir, ZkTmpDir, ZooKeeperDefaultPort);
-            serverFactory =  new NIOServerCnxn.Factory(ZooKeeperDefaultPort);
-            serverFactory.startup(zks);
-        } catch (IOException e1) {
-            // TODO Auto-generated catch block
-            e1.printStackTrace();
-        } catch (InterruptedException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        }
-        boolean b = ClientBase.waitForServerUp(HOSTPORT, ClientBase.CONNECTION_TIMEOUT);
-        
-        LOG.debug("Server up: " + b);
-        
-        // create a zookeeper client
-        LOG.debug("Instantiate ZK Client");
-        zkc = new ZooKeeper("127.0.0.1", ZooKeeperDefaultPort, new emptyWatcher());
-        
-        //initialize the zk client with values
-        try {
-            zkc.create("/ledgers", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort + 1), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort + 2), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-        } catch (KeeperException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        } catch (InterruptedException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        }
-        
-        // Create Bookie Servers (B1, B2, B3)
-        tmpDirB1 = File.createTempFile("bookie1", "test");
-        tmpDirB1.delete();
-        tmpDirB1.mkdir();
-         
-        bs1 = new BookieServer(initialPort, tmpDirB1, new File[]{tmpDirB1});
-        bs1.start();
-        
-        tmpDirB2 = File.createTempFile("bookie2", "test");
-        tmpDirB2.delete();
-        tmpDirB2.mkdir();
-            
-        bs2 = new BookieServer(initialPort + 1, tmpDirB2, new File[]{tmpDirB2});
-        bs2.start();
-
-        tmpDirB3 = File.createTempFile("bookie3", "test");
-        tmpDirB3.delete();
-        tmpDirB3.mkdir();
-        
-        bs3 = new BookieServer(initialPort + 2, tmpDirB3, new File[]{tmpDirB3});
-        bs3.start();
-        
-        rng = new Random(System.currentTimeMillis());   // Initialize the Random Number Generator 
-        entries = new ArrayList<byte[]>(); // initialize the  entries list
-        entriesSize = new ArrayList<Integer>(); 
+
+
+    @Before
+    @Override
+    public void setUp() throws Exception {
+        super.setUp();
+        rng = new Random(System.currentTimeMillis()); // Initialize the Random
+                                                      // Number Generator
+        entries = new ArrayList<byte[]>(); // initialize the entries list
+        entriesSize = new ArrayList<Integer>();
         sync = new SyncObj(); // initialize the synchronization data structure
     }
+
     
-    protected void tearDown(){
-        LOG.info("TearDown");
 
-        //shutdown bookie servers 
-        try {
-            bs1.shutdown();
-            bs2.shutdown();
-            bs3.shutdown();
-        } catch (InterruptedException e) {
-            e.printStackTrace();
-        }
-        cleanUpDir(tmpDirB1);
-        cleanUpDir(tmpDirB2);
-        cleanUpDir(tmpDirB3);
-        
-        //shutdown ZK server
-        serverFactory.shutdown();
-        assertTrue("waiting for server down",
-                ClientBase.waitForServerDown(HOSTPORT,
-                                             ClientBase.CONNECTION_TIMEOUT));
-        //ServerStats.unregister();
-        cleanUpDir(ZkTmpDir);
-        
-    }
 
-    /*  Clean up a directory recursively */
-    protected boolean cleanUpDir(File dir){
-        if (dir.isDirectory()) {
-            LOG.info("Cleaning up " + dir.getName());
-            String[] children = dir.list();
-            for (String string : children) {
-                boolean success = cleanUpDir(new File(dir, string));
-                if (!success) return false;
-            }
-        }
-        // The directory is now empty so delete it
-        return dir.delete();        
-    }
 
-    /*  User for testing purposes, void */
-    class emptyWatcher implements Watcher{
-        public void process(WatchedEvent event) {}
-    }
 }

+ 178 - 0
src/contrib/bookkeeper/test/org/apache/bookkeeper/test/BaseTestCase.java

@@ -0,0 +1,178 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.bookkeeper.test;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.bookkeeper.client.BookKeeper;
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
+import org.apache.bookkeeper.proto.BookieServer;
+import org.apache.log4j.Logger;
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.WatchedEvent;
+import org.apache.zookeeper.Watcher;
+import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.ZooDefs.Ids;
+import org.apache.zookeeper.server.NIOServerCnxn;
+import org.apache.zookeeper.server.ZooKeeperServer;
+import org.apache.zookeeper.test.ClientBase;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import junit.framework.TestCase;
+
+/**
+ * Base class for BookKeeper tests. setUp() starts a ZooKeeper server on port
+ * 2181, registers numBookies bookies under /ledgers/available, starts one
+ * BookieServer per bookie on consecutive ports beginning at initialPort, and
+ * creates the BookKeeper client bkc. tearDown() stops everything again and
+ * removes the temporary directories.
+ * 
+ * The class runs with the Parameterized runner; configs() supplies one
+ * DigestType per run.
+ */
+@RunWith(Parameterized.class)
+public abstract class BaseTestCase extends TestCase {
+    static final Logger LOG = Logger.getLogger(BaseTestCase.class);
+    // ZooKeeper related variables
+    private static final String HOSTPORT = "127.0.0.1:2181";
+    static Integer ZooKeeperDefaultPort = 2181;
+    ZooKeeperServer zks;
+    ZooKeeper zkc; // zookeeper client
+    NIOServerCnxn.Factory serverFactory;
+    File ZkTmpDir;
+
+    // BookKeeper
+    List<File> tmpDirs = new ArrayList<File>();
+    List<BookieServer> bs = new ArrayList<BookieServer>();
+    Integer initialPort = 5000;
+    int numBookies;
+    BookKeeper bkc;
+
+    /**
+     * @param numBookies
+     *            number of bookie servers that setUp() starts
+     */
+    public BaseTestCase(int numBookies) {
+        this.numBookies = numBookies;
+    }
+
+    /**
+     * Parameter sets for the Parameterized runner: one run per digest type.
+     */
+    @Parameters
+    public static Collection<Object[]> configs(){
+        return Arrays.asList(new Object[][]{ {DigestType.MAC }, {DigestType.CRC32}});
+    }
+
+
+    /**
+     * Starts the ZooKeeper server, registers and starts the bookies, and
+     * creates the BookKeeper client. Any failure is logged and rethrown.
+     */
+    @Before
+    @Override
+    public void setUp() throws Exception {
+        try {
+            // create a ZooKeeper server(dataDir, dataLogDir, port)
+            LOG.debug("Running ZK server");
+            // ServerStats.registerAsConcrete();
+            ClientBase.setupTestEnv();
+            ZkTmpDir = File.createTempFile("zookeeper", "test");
+            ZkTmpDir.delete();
+            ZkTmpDir.mkdir();
+
+            zks = new ZooKeeperServer(ZkTmpDir, ZkTmpDir, ZooKeeperDefaultPort);
+            serverFactory = new NIOServerCnxn.Factory(ZooKeeperDefaultPort);
+            serverFactory.startup(zks);
+
+            boolean b = ClientBase.waitForServerUp(HOSTPORT, ClientBase.CONNECTION_TIMEOUT);
+
+            LOG.debug("Server up: " + b);
+
+            // create a zookeeper client
+            LOG.debug("Instantiate ZK Client");
+            zkc = new ZooKeeper("127.0.0.1", ZooKeeperDefaultPort, new emptyWatcher());
+
+            // initialize the zk client with values
+            zkc.create("/ledgers", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
+            zkc.create("/ledgers/available", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
+            for (int i = 0; i < numBookies; i++) {
+                zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort + i), new byte[0],
+                        Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
+            }
+
+            // Create the bookie servers, one temporary directory each
+            for (int i = 0; i < numBookies; i++) {
+                File f = File.createTempFile("bookie", "test");
+                tmpDirs.add(f);
+                f.delete();
+                f.mkdir();
+
+                BookieServer server = new BookieServer(initialPort + i, f, new File[] { f });
+                server.start();
+                bs.add(server);
+            }
+            zkc.close();
+            bkc = new BookKeeper("127.0.0.1");
+        } catch(Exception e) {
+            LOG.error("Error setting up", e);
+            throw e;
+        }
+    }
+
+    /**
+     * Halts the client, shuts down the bookies and the ZooKeeper server, and
+     * deletes the temporary directories. Each resource is checked for null
+     * first because setUp() may have failed before creating it.
+     */
+    @After
+    @Override
+    public void tearDown() throws Exception {
+        LOG.info("TearDown");
+
+        if (bkc != null) {
+            bkc.halt();
+        }
+
+        for (BookieServer server : bs) {
+            server.shutdown();
+        }
+
+        for (File f : tmpDirs) {
+            cleanUpDir(f);
+        }
+
+        // shutdown ZK server
+        if (serverFactory != null) {
+            serverFactory.shutdown();
+            assertTrue("waiting for server down", ClientBase.waitForServerDown(HOSTPORT, ClientBase.CONNECTION_TIMEOUT));
+        }
+        // ServerStats.unregister();
+        if (ZkTmpDir != null) {
+            cleanUpDir(ZkTmpDir);
+        }
+
+    }
+
+    /**
+     * Cleans up a directory recursively.
+     * 
+     * @param dir
+     *            file or directory to delete
+     * @return true if dir and everything below it was deleted
+     */
+    protected boolean cleanUpDir(File dir) {
+        if (dir.isDirectory()) {
+            LOG.info("Cleaning up " + dir.getName());
+            String[] children = dir.list();
+            // list() returns null when the directory cannot be read
+            if (children != null) {
+                for (String string : children) {
+                    boolean success = cleanUpDir(new File(dir, string));
+                    if (!success)
+                        return false;
+                }
+            }
+        }
+        // The directory is now empty so delete it
+        return dir.delete();
+    }
+
+    /* Used for testing purposes; ignores every event */
+    class emptyWatcher implements Watcher {
+        public void process(WatchedEvent event) {
+        }
+    }
+
+}

+ 99 - 76
src/contrib/bookkeeper/test/org/apache/bookkeeper/test/BookieClientTest.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.test;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,20 +21,25 @@ package org.apache.bookkeeper.test;
  * 
  */
 
-
 import java.io.File;
+import java.net.InetSocketAddress;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
+import java.util.concurrent.Executors;
 
+import org.jboss.netty.buffer.ChannelBuffer;
+import org.jboss.netty.buffer.ChannelBuffers;
+import org.jboss.netty.channel.socket.ClientSocketChannelFactory;
+import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory;
 import org.junit.Test;
+import org.apache.bookkeeper.client.BKException;
 import org.apache.bookkeeper.proto.BookieClient;
-import org.apache.bookkeeper.proto.BookieProtocol;
 import org.apache.bookkeeper.proto.BookieServer;
-import org.apache.bookkeeper.proto.ReadEntryCallback;
-import org.apache.bookkeeper.proto.WriteCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.ReadEntryCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.WriteCallback;
+import org.apache.bookkeeper.util.OrderedSafeExecutor;
 import org.apache.log4j.Logger;
 
-
 import junit.framework.TestCase;
 
 public class BookieClientTest extends TestCase {
@@ -41,57 +47,64 @@ public class BookieClientTest extends TestCase {
     BookieServer bs;
     File tmpDir;
     int port = 13645;
+    ClientSocketChannelFactory channelFactory;
+    OrderedSafeExecutor executor;
+
+    @Override
     protected void setUp() throws Exception {
         tmpDir = File.createTempFile("bookie", "test");
         tmpDir.delete();
         tmpDir.mkdir();
         bs = new BookieServer(port, tmpDir, new File[] { tmpDir });
         bs.start();
+        channelFactory = new NioClientSocketChannelFactory(Executors.newCachedThreadPool(), Executors
+                .newCachedThreadPool());
+        executor = new OrderedSafeExecutor(2);
     }
+
+    @Override
     protected void tearDown() throws Exception {
         bs.shutdown();
         recursiveDelete(tmpDir);
+        channelFactory.releaseExternalResources();
+        executor.shutdown();
     }
+
     private static void recursiveDelete(File dir) {
         File children[] = dir.listFiles();
         if (children != null) {
-            for(File child: children) {
+            for (File child : children) {
                 recursiveDelete(child);
             }
         }
         dir.delete();
     }
-    
+
     static class ResultStruct {
         int rc;
         ByteBuffer entry;
     }
+
     ReadEntryCallback recb = new ReadEntryCallback() {
 
-        public void readEntryComplete(int rc, long ledgerId, long entryId,
-                ByteBuffer bb, Object ctx) {
-            ResultStruct rs = (ResultStruct)ctx;
-            synchronized(rs) {
-                LOG.info("Capacity " + bb.capacity() + ", " + bb.position());
+        public void readEntryComplete(int rc, long ledgerId, long entryId, ChannelBuffer bb, Object ctx) {
+            ResultStruct rs = (ResultStruct) ctx;
+            synchronized (rs) {
                 rs.rc = rc;
-                bb.position(bb.position()+16);
-                //if (bb.remaining() >=4) {
-                //    // Skip the len
-                //    bb.position(bb.position()+4);
-                //}
-                rs.entry = bb.slice();
-                LOG.info("Received " + bb.remaining());
-                rs.notifyAll();
+                if (bb != null) {
+                    bb.readerIndex(16);
+                    rs.entry = bb.toByteBuffer();
+                    rs.notifyAll();
+                }
             }
         }
-        
+
     };
 
     WriteCallback wrcb = new WriteCallback() {
-        public void writeComplete(int rc, long ledgerId, long entryId,
-                Object ctx) {
+        public void writeComplete(int rc, long ledgerId, long entryId, InetSocketAddress addr, Object ctx) {
             if (ctx != null) {
-                synchronized(ctx) {
+                synchronized (ctx) {
                     ctx.notifyAll();
                 }
             }
@@ -103,104 +116,114 @@ public class BookieClientTest extends TestCase {
         final Object notifyObject = new Object();
         byte[] passwd = new byte[20];
         Arrays.fill(passwd, (byte) 'a');
-        
-        BookieClient bc = new BookieClient("127.0.0.1", port, 50000);
-        ByteBuffer bb;
-        bb = createByteBuffer(1,1,1);
-        bc.addEntry(1, passwd, 1, bb, wrcb, null);
-        bb = createByteBuffer(2,1,2);
-        bc.addEntry(1, passwd, 2, bb, wrcb, null);
-        bb = createByteBuffer(3,1,3);
-        bc.addEntry(1, passwd, 3, bb, wrcb, null);
-        bb = createByteBuffer(5,1,5);
-        bc.addEntry(1, passwd, 5, bb, wrcb, null);
-        bb = createByteBuffer(7,1,7);
-        bc.addEntry(1, passwd, 7, bb, wrcb, null);
-        synchronized(notifyObject) {
-            bb = createByteBuffer(11,1,11);
-            bc.addEntry(1, passwd, 11, bb, wrcb, notifyObject);
+        InetSocketAddress addr = new InetSocketAddress("127.0.0.1", port);
+        ResultStruct arc = new ResultStruct();
+
+        BookieClient bc = new BookieClient(channelFactory, executor);
+        ChannelBuffer bb;
+        bb = createByteBuffer(1, 1, 1);
+        bc.addEntry(addr, 1, passwd, 1, bb, wrcb, null);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 1, recb, arc);
+            arc.wait(1000);
+            assertEquals(0, arc.rc);
+            assertEquals(1, arc.entry.getInt());
+        }
+        bb = createByteBuffer(2, 1, 2);
+        bc.addEntry(addr, 1, passwd, 2, bb, wrcb, null);
+        bb = createByteBuffer(3, 1, 3);
+        bc.addEntry(addr, 1, passwd, 3, bb, wrcb, null);
+        bb = createByteBuffer(5, 1, 5);
+        bc.addEntry(addr, 1, passwd, 5, bb, wrcb, null);
+        bb = createByteBuffer(7, 1, 7);
+        bc.addEntry(addr, 1, passwd, 7, bb, wrcb, null);
+        synchronized (notifyObject) {
+            bb = createByteBuffer(11, 1, 11);
+            bc.addEntry(addr, 1, passwd, 11, bb, wrcb, notifyObject);
             notifyObject.wait();
         }
-        ResultStruct arc = new ResultStruct();
-        synchronized(arc) {
-            bc.readEntry(1, 6, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 6, recb, arc);
             arc.wait(1000);
-            assertEquals(BookieProtocol.ENOENTRY, arc.rc);
+            assertEquals(BKException.Code.NoSuchEntryException, arc.rc);
         }
-        synchronized(arc) {
-            bc.readEntry(1, 7, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 7, recb, arc);
             arc.wait(1000);
             assertEquals(0, arc.rc);
             assertEquals(7, arc.entry.getInt());
         }
-        synchronized(arc) {
-            bc.readEntry(1, 1, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 1, recb, arc);
             arc.wait(1000);
             assertEquals(0, arc.rc);
             assertEquals(1, arc.entry.getInt());
         }
-        synchronized(arc) {
-            bc.readEntry(1, 2, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 2, recb, arc);
             arc.wait(1000);
             assertEquals(0, arc.rc);
             assertEquals(2, arc.entry.getInt());
         }
-        synchronized(arc) {
-            bc.readEntry(1, 3, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 3, recb, arc);
             arc.wait(1000);
             assertEquals(0, arc.rc);
             assertEquals(3, arc.entry.getInt());
         }
-        synchronized(arc) {
-            bc.readEntry(1, 4, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 4, recb, arc);
             arc.wait(1000);
-            assertEquals(BookieProtocol.ENOENTRY, arc.rc);
+            assertEquals(BKException.Code.NoSuchEntryException, arc.rc);
         }
-        synchronized(arc) {
-            bc.readEntry(1, 11, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 11, recb, arc);
             arc.wait(1000);
             assertEquals(0, arc.rc);
             assertEquals(11, arc.entry.getInt());
         }
-        synchronized(arc) {
-            bc.readEntry(1, 5, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 5, recb, arc);
             arc.wait(1000);
             assertEquals(0, arc.rc);
             assertEquals(5, arc.entry.getInt());
         }
-        synchronized(arc) {
-            bc.readEntry(1, 10, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 10, recb, arc);
             arc.wait(1000);
-            assertEquals(BookieProtocol.ENOENTRY, arc.rc);
+            assertEquals(BKException.Code.NoSuchEntryException, arc.rc);
         }
-        synchronized(arc) {
-            bc.readEntry(1, 12, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 12, recb, arc);
             arc.wait(1000);
-            assertEquals(BookieProtocol.ENOENTRY, arc.rc);
+            assertEquals(BKException.Code.NoSuchEntryException, arc.rc);
         }
-        synchronized(arc) {
-            bc.readEntry(1, 13, recb, arc);
+        synchronized (arc) {
+            bc.readEntry(addr, 1, 13, recb, arc);
             arc.wait(1000);
-            assertEquals(BookieProtocol.ENOENTRY, arc.rc);
+            assertEquals(BKException.Code.NoSuchEntryException, arc.rc);
         }
     }
-    private ByteBuffer createByteBuffer(int i, long lid, long eid) {
+
+    private ChannelBuffer createByteBuffer(int i, long lid, long eid) {
         ByteBuffer bb;
-        bb = ByteBuffer.allocate(4+16);
-        bb.putInt(i);
+        bb = ByteBuffer.allocate(4 + 16);
         bb.putLong(lid);
         bb.putLong(eid);
+        bb.putInt(i);
         bb.flip();
-        return bb;
+        return ChannelBuffers.wrappedBuffer(bb);
     }
+
     @Test
     public void testNoLedger() throws Exception {
         ResultStruct arc = new ResultStruct();
-        BookieClient bc = new BookieClient("127.0.0.1", port, 50000);
-        synchronized(arc) {
-            bc.readEntry(2, 13, recb, arc);
+        InetSocketAddress addr = new InetSocketAddress("127.0.0.1", port);
+        BookieClient bc = new BookieClient(channelFactory, executor);
+        synchronized (arc) {
+            bc.readEntry(addr, 2, 13, recb, arc);
             arc.wait(1000);
-            assertEquals(BookieProtocol.ENOLEDGER, arc.rc);
+            assertEquals(BKException.Code.NoSuchEntryException, arc.rc);
         }
     }
 }

+ 157 - 238
src/contrib/bookkeeper/test/org/apache/bookkeeper/test/BookieFailureTest.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.test;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,190 +21,200 @@ package org.apache.bookkeeper.test;
  * 
  */
 
-
 import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
+import java.util.Enumeration;
 import java.util.Random;
 import java.util.Set;
 
 import org.apache.bookkeeper.client.AsyncCallback.AddCallback;
 import org.apache.bookkeeper.client.BKException;
 import org.apache.bookkeeper.client.BookKeeper;
+import org.apache.bookkeeper.client.LedgerEntry;
 import org.apache.bookkeeper.client.LedgerHandle;
-import org.apache.bookkeeper.client.LedgerHandle.QMode;
-import org.apache.bookkeeper.client.LedgerSequence;
 import org.apache.bookkeeper.client.AsyncCallback.ReadCallback;
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
 import org.apache.bookkeeper.proto.BookieServer;
-import org.apache.bookkeeper.streaming.LedgerInputStream;
-import org.apache.bookkeeper.streaming.LedgerOutputStream;
-
-import org.apache.log4j.ConsoleAppender;
-import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
-import org.apache.log4j.PatternLayout;
-import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.ZooKeeper;
-import org.apache.zookeeper.ZooDefs.Ids;
-import org.apache.zookeeper.server.NIOServerCnxn;
-import org.apache.zookeeper.server.ZooKeeperServer;
-import org.apache.zookeeper.test.ClientBase;
-
+import org.junit.Before;
 import org.junit.Test;
 
 /**
- * This test tests read and write, synchronous and 
- * asynchronous, strings and integers for a BookKeeper client. 
- * The test deployment uses a ZooKeeper server 
- * and three BookKeepers. 
+ * This test checks that writes and reads through a BookKeeper client still
+ * succeed when a bookie is shut down. The test deployment uses a ZooKeeper
+ * server and four bookies.
  * 
  */
 
-public class BookieFailureTest 
-    extends junit.framework.TestCase 
-    implements AddCallback, ReadCallback{
-    
+public class BookieFailureTest extends BaseTestCase implements AddCallback, ReadCallback {
 
-    //Depending on the taste, select the amount of logging
+    // Depending on the taste, select the amount of logging
     // by decommenting one of the two lines below
-    //static Logger LOG = Logger.getRootLogger();
-    static Logger LOG = Logger.getLogger(BookieReadWriteTest.class);
-
-    static ConsoleAppender ca = new ConsoleAppender(new PatternLayout());
-
-    // ZooKeeper related variables
-    private static final String HOSTPORT = "127.0.0.1:2181";
-    static Integer ZooKeeperDefaultPort = 2181;
-    ZooKeeperServer zks;
-    ZooKeeper zkc; //zookeeper client
-    NIOServerCnxn.Factory serverFactory;
-    File ZkTmpDir;
-    
-    //BookKeeper 
-    File tmpDirB1, tmpDirB2, tmpDirB3, tmpDirB4;
-    BookieServer bs1, bs2, bs3, bs4;
-    Integer initialPort = 5000;
-    BookKeeper bkc; // bookkeeper client
+    // static Logger LOG = Logger.getRootLogger();
+    static Logger LOG = Logger.getLogger(BookieFailureTest.class);
+
     byte[] ledgerPassword = "aaa".getBytes();
     LedgerHandle lh, lh2;
     long ledgerId;
-    LedgerSequence ls;
-    
-    //test related variables 
-    int numEntriesToWrite = 20000;
+    Enumeration<LedgerEntry> ls;
+
+    // test related variables
+    int numEntriesToWrite = 200;
     int maxInt = 2147483647;
-    Random rng; // Random Number Generator 
+    Random rng; // Random Number Generator
     ArrayList<byte[]> entries; // generated entries
     ArrayList<Integer> entriesSize;
+    DigestType digestType;
     
     // Synchronization
     SyncObj sync;
     Set<Object> syncObjs;
-    
+
     class SyncObj {
         int counter;
-        boolean value;      
+        boolean value;
+
         public SyncObj() {
             counter = 0;
             value = false;
-        }       
+        }
+    }
+
+    public BookieFailureTest(DigestType digestType) {
+        super(4);
+        this.digestType = digestType;        
     }
     
-   /**
-    * Tests writes and reads when a bookie fails.
-    *  
-    * @throws {@link IOException}
-    */
+    /**
+     * Tests writes and reads when a bookie fails.
+     * 
+     * @throws IOException
+     */
     @Test
-    public void testAsyncBK1() throws IOException{ 
+    public void testAsyncBK1() throws IOException {
         LOG.info("#### BK1 ####");
-        auxTestReadWriteAsyncSingleClient(bs1);
+        auxTestReadWriteAsyncSingleClient(bs.get(0));
     }
-   
-   @Test
-   public void testAsyncBK2() throws IOException{    
-       LOG.info("#### BK2 ####");
-       auxTestReadWriteAsyncSingleClient(bs2);
-   }
-   
-   @Test
-   public void testAsyncBK3() throws IOException{    
-       LOG.info("#### BK3 ####"); 
-       auxTestReadWriteAsyncSingleClient(bs3);
-   }
-   
-   @Test
-   public void testAsyncBK4() throws IOException{
-       LOG.info("#### BK4 ####");
-        auxTestReadWriteAsyncSingleClient(bs4);
-   }
     
-    void auxTestReadWriteAsyncSingleClient(BookieServer bs) throws IOException{
+    @Test
+    public void testAsyncBK2() throws IOException {
+        LOG.info("#### BK2 ####");
+        auxTestReadWriteAsyncSingleClient(bs.get(1));
+    }
+
+    @Test
+    public void testAsyncBK3() throws IOException {
+        LOG.info("#### BK3 ####");
+        auxTestReadWriteAsyncSingleClient(bs.get(2));
+    }
+
+    @Test
+    public void testAsyncBK4() throws IOException {
+        LOG.info("#### BK4 ####");
+        auxTestReadWriteAsyncSingleClient(bs.get(3));
+    }
+    
+    @Test
+    public void testBookieRecovery() throws Exception{
+        bkc = new BookKeeper("127.0.0.1");
+        
+        //Shutdown all but 1 bookie
+        bs.get(0).shutdown();
+        bs.get(1).shutdown();
+        bs.get(2).shutdown();
+        
+        byte[] passwd = "blah".getBytes();
+        LedgerHandle lh = bkc.createLedger(1, 1,digestType, passwd);
+        
+        int numEntries = 100;
+        for (int i=0; i< numEntries; i++){
+            byte[] data = (""+i).getBytes();
+            lh.addEntry(data);
+        }
+        
+        bs.get(3).shutdown();
+        BookieServer server = new BookieServer(initialPort + 3, tmpDirs.get(3), new File[] { tmpDirs.get(3)});
+        server.start();
+        bs.set(3, server);
+
+        assertEquals(numEntries - 1 , lh.getLastAddConfirmed());
+        Enumeration<LedgerEntry> entries = lh.readEntries(0, lh.getLastAddConfirmed());
+        
+        int numScanned = 0;
+        while (entries.hasMoreElements()){
+            assertEquals((""+numScanned), new String(entries.nextElement().getEntry()));
+            numScanned++;
+        }
+        assertEquals(numEntries, numScanned);
+        
+        
+    }
+
+    void auxTestReadWriteAsyncSingleClient(BookieServer bs) throws IOException {
         try {
             // Create a BookKeeper client and a ledger
-            bkc = new BookKeeper("127.0.0.1");
-            lh = bkc.createLedger(4, 2, QMode.VERIFIABLE, ledgerPassword);
-        
+            lh = bkc.createLedger(3, 2, digestType, ledgerPassword);
+
             ledgerId = lh.getId();
             LOG.info("Ledger ID: " + lh.getId());
-            for(int i = 0; i < numEntriesToWrite; i++){
+            for (int i = 0; i < numEntriesToWrite; i++) {
                 ByteBuffer entry = ByteBuffer.allocate(4);
                 entry.putInt(rng.nextInt(maxInt));
                 entry.position(0);
-                
+
                 entries.add(entry.array());
                 entriesSize.add(entry.array().length);
                 lh.asyncAddEntry(entry.array(), this, sync);
-                if(i == 5000){
-                    //Bookie fail
-                    bs.shutdown();
-                }
+                
             }
             
+            LOG.info("Wrote " + numEntriesToWrite + " and now going to fail bookie.");
+            // Bookie fail
+            bs.shutdown();
+
             // wait for all entries to be acknowledged
             synchronized (sync) {
-                while (sync.counter < numEntriesToWrite){
+                while (sync.counter < numEntriesToWrite) {
                     LOG.debug("Entries counter = " + sync.counter);
                     sync.wait();
                 }
             }
-            
+
             LOG.debug("*** WRITE COMPLETE ***");
-            // close ledger 
+            // close ledger
             lh.close();
-            
-            //*** WRITING PART COMPLETE // READ PART BEGINS ***
-            
+
+            // *** WRITING PART COMPLETE // READ PART BEGINS ***
+
             // open ledger
+            bkc.halt();
             bkc = new BookKeeper("127.0.0.1");
-            lh = bkc.openLedger(ledgerId, ledgerPassword);
-            LOG.debug("Number of entries written: " + lh.getLast());
-            assertTrue("Verifying number of entries written", lh.getLast() == (numEntriesToWrite - 1));     
-            
-            //read entries
-            
-            lh.asyncReadEntries(0, numEntriesToWrite - 1, this, (Object) sync);
-            
+            lh = bkc.openLedger(ledgerId, digestType, ledgerPassword);
+            LOG.debug("Number of entries written: " + (lh.getLastAddConfirmed() + 1));
+            assertTrue("Verifying number of entries written", lh.getLastAddConfirmed() == (numEntriesToWrite - 1));
+
+            // read entries
+
+            lh.asyncReadEntries(0, numEntriesToWrite - 1, this, sync);
+
             synchronized (sync) {
-                while(sync.value == false){
+                while (sync.value == false) {
                     sync.wait(10000);
                     assertTrue("Haven't received entries", sync.value);
-                }               
+                }
             }
-            
-            assertTrue("Checking number of read entries", ls.size() == numEntriesToWrite);
-            
+
             LOG.debug("*** READ COMPLETE ***");
-            
-            // at this point, LedgerSequence ls is filled with the returned values
+
+            // at this point, the Enumeration ls is filled with the returned
+            // entries
             int i = 0;
-            LOG.info("Size of ledger sequence: " + ls.size());
-            while(ls.hasMoreElements()){
+            while (ls.hasMoreElements()) {
                 ByteBuffer origbb = ByteBuffer.wrap(entries.get(i));
                 Integer origEntry = origbb.getInt();
                 byte[] entry = ls.nextElement().getEntry();
@@ -215,24 +226,26 @@ public class BookieFailureTest
                 assertTrue("Checking entry " + i + " for size", entry.length == entriesSize.get(i).intValue());
                 i++;
             }
-            
+
+            assertTrue("Checking number of read entries", i == numEntriesToWrite);
+
             LOG.info("Verified that entries are ok, and now closing ledger");
             lh.close();
         } catch (KeeperException e) {
+            LOG.error("Caught KeeperException", e);
             fail(e.toString());
         } catch (BKException e) {
+            LOG.error("Caught BKException", e);
             fail(e.toString());
         } catch (InterruptedException e) {
+            LOG.error("Caught InterruptedException", e);
             fail(e.toString());
-        } 
-        
+        }
+
     }
-    
-    public void addComplete(int rc, 
-            LedgerHandle lh, 
-            long entryId, 
-            Object ctx) {
-        if(rc != 0)
+
+    public void addComplete(int rc, LedgerHandle lh, long entryId, Object ctx) {
+        if (rc != 0)
             fail("Failed to write entry: " + entryId);
         SyncObj x = (SyncObj) ctx;
         synchronized (x) {
@@ -241,146 +254,52 @@ public class BookieFailureTest
         }
     }
 
-    public void readComplete(int rc, 
-            LedgerHandle lh, 
-            LedgerSequence seq,
-            Object ctx) {
-        if(rc != 0)
+    public void readComplete(int rc, LedgerHandle lh, Enumeration<LedgerEntry> seq, Object ctx) {
+        if (rc != 0)
             fail("Failed to write entry");
-        ls = seq;               
+        ls = seq;
         synchronized (sync) {
             sync.value = true;
             sync.notify();
         }
-        
+
     }
-    
-    protected void setUp() throws IOException, InterruptedException {
-        LOG.addAppender(ca);
-        LOG.setLevel((Level) Level.DEBUG);
-        
-        // create a ZooKeeper server(dataDir, dataLogDir, port)
-        LOG.debug("Running ZK server (setup)");
-        //ServerStats.registerAsConcrete();
-        ClientBase.setupTestEnv();
-        ZkTmpDir = File.createTempFile("zookeeper", "test");
-        ZkTmpDir.delete();
-        ZkTmpDir.mkdir();
-            
-        try {
-            zks = new ZooKeeperServer(ZkTmpDir, ZkTmpDir, ZooKeeperDefaultPort);
-            serverFactory =  new NIOServerCnxn.Factory(ZooKeeperDefaultPort);
-            serverFactory.startup(zks);
-        } catch (IOException e1) {
-            // TODO Auto-generated catch block
-            e1.printStackTrace();
-        } catch (InterruptedException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        }
-        boolean b = ClientBase.waitForServerUp(HOSTPORT, ClientBase.CONNECTION_TIMEOUT);
-        
-        LOG.debug("Server up: " + b);
-        
-        // create a zookeeper client
-        LOG.debug("Instantiate ZK Client");
-        zkc = new ZooKeeper("127.0.0.1", ZooKeeperDefaultPort, new emptyWatcher());
-        
-        //initialize the zk client with values
-        try {
-            zkc.create("/ledgers", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort + 1), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort + 2), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort + 3), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-        } catch (KeeperException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        } catch (InterruptedException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        }
-        
-        // Create Bookie Servers (B1, B2, B3)
-        tmpDirB1 = File.createTempFile("bookie1", "test");
-        tmpDirB1.delete();
-        tmpDirB1.mkdir();
-         
-        bs1 = new BookieServer(initialPort, tmpDirB1, new File[]{tmpDirB1});
-        bs1.start();
-        
-        tmpDirB2 = File.createTempFile("bookie2", "test");
-        tmpDirB2.delete();
-        tmpDirB2.mkdir();
-            
-        bs2 = new BookieServer(initialPort + 1, tmpDirB2, new File[]{tmpDirB2});
-        bs2.start();
 
-        tmpDirB3 = File.createTempFile("bookie3", "test");
-        tmpDirB3.delete();
-        tmpDirB3.mkdir();
-        
-        bs3 = new BookieServer(initialPort + 2, tmpDirB3, new File[]{tmpDirB3});
-        bs3.start();
-        
-        tmpDirB4 = File.createTempFile("bookie4", "test");
-        tmpDirB4.delete();
-        tmpDirB4.mkdir();
-        
-        bs4 = new BookieServer(initialPort + 3, tmpDirB4, new File[]{tmpDirB4});
-        bs4.start();
-        
-        rng = new Random(System.currentTimeMillis());   // Initialize the Random Number Generator 
-        entries = new ArrayList<byte[]>(); // initialize the  entries list
-        entriesSize = new ArrayList<Integer>(); 
+    @Before
+    @Override
+    public void setUp() throws Exception {
+        super.setUp();
+
+        rng = new Random(System.currentTimeMillis()); // Initialize the Random
+                                                      // Number Generator
+        entries = new ArrayList<byte[]>(); // initialize the entries list
+        entriesSize = new ArrayList<Integer>();
         sync = new SyncObj(); // initialize the synchronization data structure
-        
+
         zkc.close();
     }
-    
-    protected void tearDown() throws InterruptedException {
-        LOG.info("TearDown");
-        bkc.halt();
-        
-        //shutdown bookie servers 
-        if(!bs1.isDown()) bs1.shutdown();
-        if(!bs2.isDown()) bs2.shutdown();
-        if(!bs3.isDown()) bs3.shutdown();
-        if(!bs4.isDown()) bs4.shutdown();
-             
-        cleanUpDir(tmpDirB1);
-        cleanUpDir(tmpDirB2);
-        cleanUpDir(tmpDirB3);
-        cleanUpDir(tmpDirB4);
-        //shutdown ZK server
-        serverFactory.shutdown();
-        assertTrue("waiting for server down",
-                ClientBase.waitForServerDown(HOSTPORT,
-                                             ClientBase.CONNECTION_TIMEOUT));
-        //ServerStats.unregister();
-        cleanUpDir(ZkTmpDir);
-        
-    }
 
-    /*  Clean up a directory recursively */
-    protected boolean cleanUpDir(File dir){
+
+    /* Clean up a directory recursively */
+    @Override
+    protected boolean cleanUpDir(File dir) {
         if (dir.isDirectory()) {
             LOG.info("Cleaning up " + dir.getName());
             String[] children = dir.list();
             for (String string : children) {
                 boolean success = cleanUpDir(new File(dir, string));
-                if (!success) return false;
+                if (!success)
+                    return false;
             }
         }
         // The directory is now empty so delete it
-        return dir.delete();        
+        return dir.delete();
     }
 
-    /*  User for testing purposes, void */
-    class emptyWatcher implements Watcher{
-        public void process(WatchedEvent event) {}
+    /* Used for testing purposes; does nothing */
+    class emptyWatcher implements Watcher {
+        public void process(WatchedEvent event) {
+        }
     }
-    
 
 }

+ 181 - 296
src/contrib/bookkeeper/test/org/apache/bookkeeper/test/BookieReadWriteTest.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.test;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,157 +21,132 @@ package org.apache.bookkeeper.test;
  * 
  */
 
-
 import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
+import java.util.Enumeration;
 import java.util.Random;
 import java.util.Set;
 
 import org.apache.bookkeeper.client.AsyncCallback.AddCallback;
 import org.apache.bookkeeper.client.BKException;
 import org.apache.bookkeeper.client.BookKeeper;
+import org.apache.bookkeeper.client.LedgerEntry;
 import org.apache.bookkeeper.client.LedgerHandle;
-import org.apache.bookkeeper.client.LedgerSequence;
 import org.apache.bookkeeper.client.AsyncCallback.ReadCallback;
-import org.apache.bookkeeper.proto.BookieServer;
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
 import org.apache.bookkeeper.streaming.LedgerInputStream;
 import org.apache.bookkeeper.streaming.LedgerOutputStream;
-
-import org.apache.log4j.ConsoleAppender;
-import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
-import org.apache.log4j.PatternLayout;
-import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.ZooKeeper;
-import org.apache.zookeeper.ZooDefs.Ids;
-import org.apache.zookeeper.server.NIOServerCnxn;
-import org.apache.zookeeper.server.ZooKeeperServer;
-import org.apache.zookeeper.test.ClientBase;
-
+import org.junit.Before;
 import org.junit.Test;
 
 /**
- * This test tests read and write, synchronous and 
- * asynchronous, strings and integers for a BookKeeper client. 
- * The test deployment uses a ZooKeeper server 
- * and three BookKeepers. 
+ * This test tests read and write, synchronous and asynchronous, strings and
+ * integers for a BookKeeper client. The test deployment uses a ZooKeeper server
+ * and three bookies.
  * 
  */
 
-public class BookieReadWriteTest 
-    extends junit.framework.TestCase 
-    implements AddCallback, ReadCallback{
+public class BookieReadWriteTest extends BaseTestCase implements AddCallback, ReadCallback {
 
-    //Depending on the taste, select the amount of logging
+    // Depending on the taste, select the amount of logging
     // by decommenting one of the two lines below
-    //static Logger LOG = Logger.getRootLogger();
+    // static Logger LOG = Logger.getRootLogger();
     static Logger LOG = Logger.getLogger(BookieReadWriteTest.class);
 
-    static ConsoleAppender ca = new ConsoleAppender(new PatternLayout());
-
-    // ZooKeeper related variables
-    private static final String HOSTPORT = "127.0.0.1:2181";
-    static Integer ZooKeeperDefaultPort = 2181;
-    ZooKeeperServer zks;
-    ZooKeeper zkc; //zookeeper client
-    NIOServerCnxn.Factory serverFactory;
-    File ZkTmpDir;
-    
-    //BookKeeper 
-    File tmpDirB1, tmpDirB2, tmpDirB3;
-    BookieServer bs1, bs2, bs3;
-    Integer initialPort = 5000;
-    BookKeeper bkc; // bookkeeper client
     byte[] ledgerPassword = "aaa".getBytes();
     LedgerHandle lh, lh2;
     long ledgerId;
-    LedgerSequence ls;
-    
-    //test related variables 
+    Enumeration<LedgerEntry> ls;
+
+    // test related variables
     int numEntriesToWrite = 200;
     int maxInt = 2147483647;
-    Random rng; // Random Number Generator 
+    Random rng; // Random Number Generator
     ArrayList<byte[]> entries; // generated entries
     ArrayList<Integer> entriesSize;
     
+    DigestType digestType;
+    
+    public BookieReadWriteTest(DigestType digestType){
+        super(3);
+        this.digestType = digestType;
+    }
     // Synchronization
     SyncObj sync;
     Set<Object> syncObjs;
-    
+
     class SyncObj {
         int counter;
-        boolean value;      
+        boolean value;
+
         public SyncObj() {
             counter = 0;
             value = false;
-        }       
+        }
     }
-    
+
     @Test
-    public void testOpenException() 
-    throws KeeperException, IOException, InterruptedException {
-        bkc = new BookKeeper("127.0.0.1");
-        try{
-            lh = bkc.openLedger(0, ledgerPassword);
+    public void testOpenException() throws KeeperException, IOException, InterruptedException {
+        try {
+            lh = bkc.openLedger(0, digestType, ledgerPassword);
             fail("Haven't thrown exception");
         } catch (BKException e) {
             LOG.warn("Successfully thrown and caught exception:", e);
         }
     }
-    
+
     /**
-     * test the streaming api for reading
-     * and writing
+     * test the streaming api for reading and writing
+     * 
      * @throws {@link IOException}, {@link KeeperException}
      */
     @Test
-    public void testStreamingClients() throws IOException,
-        KeeperException, BKException, InterruptedException {
+    public void testStreamingClients() throws IOException, KeeperException, BKException, InterruptedException {
         bkc = new BookKeeper("127.0.0.1");
-        lh = bkc.createLedger(ledgerPassword);
-        //write a string so that we cna
+        lh = bkc.createLedger(digestType, ledgerPassword);
+        // write a string so that we can
         // create a buffer of a single bytes
         // and check for corner cases
-        String toWrite = "we need to check for this string to match " +
-                "and for the record mahadev is the best";
-        LedgerOutputStream lout = new LedgerOutputStream(lh , 1);
+        String toWrite = "we need to check for this string to match " + "and for the record mahadev is the best";
+        LedgerOutputStream lout = new LedgerOutputStream(lh, 1);
         byte[] b = toWrite.getBytes();
         lout.write(b);
         lout.close();
         long lId = lh.getId();
         lh.close();
-        //check for sanity
-        lh = bkc.openLedger(lId, ledgerPassword);
-        LedgerInputStream lin = new LedgerInputStream(lh,  1);
+        // check for sanity
+        lh = bkc.openLedger(lId, digestType, ledgerPassword);
+        LedgerInputStream lin = new LedgerInputStream(lh, 1);
         byte[] bread = new byte[b.length];
         int read = 0;
-        while (read < b.length) { 
+        while (read < b.length) {
             read = read + lin.read(bread, read, b.length);
         }
-        
+
         String newString = new String(bread);
         assertTrue("these two should same", toWrite.equals(newString));
         lin.close();
         lh.close();
-        //create another ledger to write one byte at a time
-        lh = bkc.createLedger(ledgerPassword);
+        // create another ledger to write one byte at a time
+        lh = bkc.createLedger(digestType, ledgerPassword);
         lout = new LedgerOutputStream(lh);
-        for (int i=0; i < b.length;i++) {
+        for (int i = 0; i < b.length; i++) {
             lout.write(b[i]);
         }
         lout.close();
         lId = lh.getId();
         lh.close();
-        lh = bkc.openLedger(lId, ledgerPassword);
+        lh = bkc.openLedger(lId, digestType, ledgerPassword);
         lin = new LedgerInputStream(lh);
         bread = new byte[b.length];
-        read= 0;
+        read = 0;
         while (read < b.length) {
             read = read + lin.read(bread, read, b.length);
         }
@@ -179,62 +155,60 @@ public class BookieReadWriteTest
         lin.close();
         lh.close();
     }
-        
-    
+
     @Test
-    public void testReadWriteAsyncSingleClient() throws IOException{
+    public void testReadWriteAsyncSingleClient() throws IOException {
         try {
             // Create a BookKeeper client and a ledger
             bkc = new BookKeeper("127.0.0.1");
-            lh = bkc.createLedger(ledgerPassword);
-            //bkc.initMessageDigest("SHA1");
+            lh = bkc.createLedger(digestType, ledgerPassword);
+            // bkc.initMessageDigest("SHA1");
             ledgerId = lh.getId();
             LOG.info("Ledger ID: " + lh.getId());
-            for(int i = 0; i < numEntriesToWrite; i++){
+            for (int i = 0; i < numEntriesToWrite; i++) {
                 ByteBuffer entry = ByteBuffer.allocate(4);
                 entry.putInt(rng.nextInt(maxInt));
                 entry.position(0);
-                
+
                 entries.add(entry.array());
                 entriesSize.add(entry.array().length);
                 lh.asyncAddEntry(entry.array(), this, sync);
             }
-            
+
             // wait for all entries to be acknowledged
             synchronized (sync) {
-                while (sync.counter < numEntriesToWrite){
+                while (sync.counter < numEntriesToWrite) {
                     LOG.debug("Entries counter = " + sync.counter);
                     sync.wait();
                 }
             }
-            
+
             LOG.debug("*** WRITE COMPLETE ***");
-            // close ledger 
+            // close ledger
             lh.close();
-            
-            //*** WRITING PART COMPLETE // READ PART BEGINS ***
-            
+
+            // *** WRITING PART COMPLETE // READ PART BEGINS ***
+
             // open ledger
-            lh = bkc.openLedger(ledgerId, ledgerPassword);
-            LOG.debug("Number of entries written: " + lh.getLast());
-            assertTrue("Verifying number of entries written", lh.getLast() == (numEntriesToWrite - 1));     
-            
-            //read entries
+            lh = bkc.openLedger(ledgerId, digestType, ledgerPassword);
+            LOG.debug("Number of entries written: " + (lh.getLastAddConfirmed() + 1));
+            assertTrue("Verifying number of entries written", lh.getLastAddConfirmed() == (numEntriesToWrite - 1));
+
+            // read entries
             lh.asyncReadEntries(0, numEntriesToWrite - 1, this, (Object) sync);
-            
+
             synchronized (sync) {
-                while(sync.value == false){
+                while (sync.value == false) {
                     sync.wait();
-                }               
+                }
             }
-            
-            assertTrue("Checking number of read entries", ls.size() == numEntriesToWrite);
-            
+
             LOG.debug("*** READ COMPLETE ***");
-            
-            // at this point, LedgerSequence ls is filled with the returned values
+
+            // at this point, the Enumeration ls is filled with the returned
+            // entries
             int i = 0;
-            while(ls.hasMoreElements()){
+            while (ls.hasMoreElements()) {
                 ByteBuffer origbb = ByteBuffer.wrap(entries.get(i));
                 Integer origEntry = origbb.getInt();
                 byte[] entry = ls.nextElement().getEntry();
@@ -248,6 +222,8 @@ public class BookieReadWriteTest
                 assertTrue("Checking entry " + i + " for size", entry.length == entriesSize.get(i).intValue());
                 i++;
             }
+            assertTrue("Checking number of read entries", i == numEntriesToWrite);
+
             lh.close();
         } catch (KeeperException e) {
             LOG.error("Test failed", e);
@@ -258,71 +234,72 @@ public class BookieReadWriteTest
         } catch (InterruptedException e) {
             LOG.error("Test failed", e);
             fail("Test failed due to interruption");
-        } 
+        }
     }
-    
+
     @Test
-    public void testSyncReadAsyncWriteStringsSingleClient() throws IOException{
+    public void testSyncReadAsyncWriteStringsSingleClient() throws IOException {
         LOG.info("TEST READ WRITE STRINGS MIXED SINGLE CLIENT");
         String charset = "utf-8";
-        LOG.debug("Default charset: "  + Charset.defaultCharset());
+        LOG.debug("Default charset: " + Charset.defaultCharset());
         try {
             // Create a BookKeeper client and a ledger
             bkc = new BookKeeper("127.0.0.1");
-            lh = bkc.createLedger(ledgerPassword);
-            //bkc.initMessageDigest("SHA1");
+            lh = bkc.createLedger(digestType, ledgerPassword);
+            // bkc.initMessageDigest("SHA1");
             ledgerId = lh.getId();
             LOG.info("Ledger ID: " + lh.getId());
-            for(int i = 0; i < numEntriesToWrite; i++){
+            for (int i = 0; i < numEntriesToWrite; i++) {
                 int randomInt = rng.nextInt(maxInt);
                 byte[] entry = new String(Integer.toString(randomInt)).getBytes(charset);
                 entries.add(entry);
                 lh.asyncAddEntry(entry, this, sync);
             }
-            
+
             // wait for all entries to be acknowledged
             synchronized (sync) {
-                while (sync.counter < numEntriesToWrite){
+                while (sync.counter < numEntriesToWrite) {
                     LOG.debug("Entries counter = " + sync.counter);
                     sync.wait();
                 }
             }
-            
+
             LOG.debug("*** ASYNC WRITE COMPLETE ***");
-            // close ledger 
+            // close ledger
             lh.close();
-            
-            //*** WRITING PART COMPLETED // READ PART BEGINS ***
-            
+
+            // *** WRITING PART COMPLETED // READ PART BEGINS ***
+
             // open ledger
-            lh = bkc.openLedger(ledgerId, ledgerPassword);
-            LOG.debug("Number of entries written: " + lh.getLast());
-            assertTrue("Verifying number of entries written", lh.getLast() == (numEntriesToWrite - 1));     
-            
-            //read entries          
+            lh = bkc.openLedger(ledgerId, digestType, ledgerPassword);
+            LOG.debug("Number of entries written: " + (lh.getLastAddConfirmed() + 1));
+            assertTrue("Verifying number of entries written", lh.getLastAddConfirmed() == (numEntriesToWrite - 1));
+
+            // read entries
             ls = lh.readEntries(0, numEntriesToWrite - 1);
-            
-            assertTrue("Checking number of read entries", ls.size() == numEntriesToWrite);
-            
+
             LOG.debug("*** SYNC READ COMPLETE ***");
-            
-            // at this point, LedgerSequence ls is filled with the returned values
+
+            // at this point, the Enumeration ls is filled with the returned
+            // entries
             int i = 0;
-            while(ls.hasMoreElements()){
+            while (ls.hasMoreElements()) {
                 byte[] origEntryBytes = entries.get(i++);
                 byte[] retrEntryBytes = ls.nextElement().getEntry();
-                
+
                 LOG.debug("Original byte entry size: " + origEntryBytes.length);
                 LOG.debug("Saved byte entry size: " + retrEntryBytes.length);
-                
+
                 String origEntry = new String(origEntryBytes, charset);
                 String retrEntry = new String(retrEntryBytes, charset);
-                
+
                 LOG.debug("Original entry: " + origEntry);
                 LOG.debug("Retrieved entry: " + retrEntry);
-                
+
                 assertTrue("Checking entry " + i + " for equality", origEntry.equals(retrEntry));
             }
+            assertTrue("Checking number of read entries", i == numEntriesToWrite);
+
             lh.close();
         } catch (KeeperException e) {
             LOG.error("Test failed", e);
@@ -333,34 +310,34 @@ public class BookieReadWriteTest
         } catch (InterruptedException e) {
             LOG.error("Test failed", e);
             fail("Test failed due to interruption");
-        } 
-        
+        }
+
     }
-    
+
     @Test
     public void testReadWriteSyncSingleClient() throws IOException {
         try {
             // Create a BookKeeper client and a ledger
             bkc = new BookKeeper("127.0.0.1");
-            lh = bkc.createLedger(ledgerPassword);
-            //bkc.initMessageDigest("SHA1");
+            lh = bkc.createLedger(digestType, ledgerPassword);
+            // bkc.initMessageDigest("SHA1");
             ledgerId = lh.getId();
             LOG.info("Ledger ID: " + lh.getId());
-            for(int i = 0; i < numEntriesToWrite; i++){
+            for (int i = 0; i < numEntriesToWrite; i++) {
                 ByteBuffer entry = ByteBuffer.allocate(4);
                 entry.putInt(rng.nextInt(maxInt));
                 entry.position(0);
-                entries.add(entry.array());             
+                entries.add(entry.array());
                 lh.addEntry(entry.array());
             }
             lh.close();
-            lh = bkc.openLedger(ledgerId, ledgerPassword);
-            LOG.debug("Number of entries written: " + lh.getLast());
-            assertTrue("Verifying number of entries written", lh.getLast() == (numEntriesToWrite - 1));     
-            
+            lh = bkc.openLedger(ledgerId, digestType, ledgerPassword);
+            LOG.debug("Number of entries written: " + lh.getLastAddConfirmed());
+            assertTrue("Verifying number of entries written", lh.getLastAddConfirmed() == (numEntriesToWrite - 1));
+
             ls = lh.readEntries(0, numEntriesToWrite - 1);
             int i = 0;
-            while(ls.hasMoreElements()){
+            while (ls.hasMoreElements()) {
                 ByteBuffer origbb = ByteBuffer.wrap(entries.get(i++));
                 Integer origEntry = origbb.getInt();
                 ByteBuffer result = ByteBuffer.wrap(ls.nextElement().getEntry());
@@ -381,42 +358,42 @@ public class BookieReadWriteTest
         } catch (InterruptedException e) {
             LOG.error("Test failed", e);
             fail("Test failed due to interruption");
-        } 
+        }
     }
-    
+
     @Test
     public void testReadWriteZero() throws IOException {
         try {
             // Create a BookKeeper client and a ledger
             bkc = new BookKeeper("127.0.0.1");
-            lh = bkc.createLedger(ledgerPassword);
-            //bkc.initMessageDigest("SHA1");
+            lh = bkc.createLedger(digestType, ledgerPassword);
+            // bkc.initMessageDigest("SHA1");
             ledgerId = lh.getId();
             LOG.info("Ledger ID: " + lh.getId());
-            for(int i = 0; i < numEntriesToWrite; i++){             
-            lh.addEntry(new byte[0]);
+            for (int i = 0; i < numEntriesToWrite; i++) {
+                lh.addEntry(new byte[0]);
             }
-            
+
             /*
              * Write a non-zero entry
              */
             ByteBuffer entry = ByteBuffer.allocate(4);
             entry.putInt(rng.nextInt(maxInt));
             entry.position(0);
-            entries.add(entry.array());             
-            lh.addEntry( entry.array());
-            
+            entries.add(entry.array());
+            lh.addEntry(entry.array());
+
             lh.close();
-            lh = bkc.openLedger(ledgerId, ledgerPassword);
-            LOG.debug("Number of entries written: " + lh.getLast());
-            assertTrue("Verifying number of entries written", lh.getLast() == numEntriesToWrite);       
-            
+            lh = bkc.openLedger(ledgerId, digestType, ledgerPassword);
+            LOG.debug("Number of entries written: " + lh.getLastAddConfirmed());
+            assertTrue("Verifying number of entries written", lh.getLastAddConfirmed() == numEntriesToWrite);
+
             ls = lh.readEntries(0, numEntriesToWrite - 1);
             int i = 0;
-            while(ls.hasMoreElements()){
+            while (ls.hasMoreElements()) {
                 ByteBuffer result = ByteBuffer.wrap(ls.nextElement().getEntry());
                 LOG.debug("Length of result: " + result.capacity());
-                
+
                 assertTrue("Checking if entry " + i + " has zero bytes", result.capacity() == 0);
             }
             lh.close();
@@ -429,52 +406,54 @@ public class BookieReadWriteTest
         } catch (InterruptedException e) {
             LOG.error("Test failed", e);
             fail("Test failed due to interruption");
-        } 
+        }
     }
-    
+
     @Test
     public void testMultiLedger() throws IOException {
         try {
             // Create a BookKeeper client and a ledger
             bkc = new BookKeeper("127.0.0.1");
-            lh = bkc.createLedger(ledgerPassword);
-            lh2 = bkc.createLedger(ledgerPassword);
-            
+            lh = bkc.createLedger(digestType, ledgerPassword);
+            lh2 = bkc.createLedger(digestType, ledgerPassword);
+
             long ledgerId = lh.getId();
             long ledgerId2 = lh2.getId();
-            
-            //bkc.initMessageDigest("SHA1");
+
+            // bkc.initMessageDigest("SHA1");
             LOG.info("Ledger ID 1: " + lh.getId() + ", Ledger ID 2: " + lh2.getId());
-            for(int i = 0; i < numEntriesToWrite; i++){             
-                lh.addEntry( new byte[0]);
+            for (int i = 0; i < numEntriesToWrite; i++) {
+                lh.addEntry(new byte[0]);
                 lh2.addEntry(new byte[0]);
             }
-            
+
             lh.close();
             lh2.close();
-                
-            lh = bkc.openLedger(ledgerId, ledgerPassword);
-            lh2 = bkc.openLedger(ledgerId2, ledgerPassword);
-            
-            LOG.debug("Number of entries written: " + lh.getLast() + ", " + lh2.getLast());
-            assertTrue("Verifying number of entries written lh (" + lh.getLast() + ")" , lh.getLast() == (numEntriesToWrite - 1));
-            assertTrue("Verifying number of entries written lh2 (" + lh2.getLast() + ")", lh2.getLast() == (numEntriesToWrite - 1));
-            
+
+            lh = bkc.openLedger(ledgerId, digestType, ledgerPassword);
+            lh2 = bkc.openLedger(ledgerId2, digestType, ledgerPassword);
+
+            LOG.debug("Number of entries written: " + lh.getLastAddConfirmed() + ", " + lh2.getLastAddConfirmed());
+            assertTrue("Verifying number of entries written lh (" + lh.getLastAddConfirmed() + ")", lh
+                    .getLastAddConfirmed() == (numEntriesToWrite - 1));
+            assertTrue("Verifying number of entries written lh2 (" + lh2.getLastAddConfirmed() + ")", lh2
+                    .getLastAddConfirmed() == (numEntriesToWrite - 1));
+
             ls = lh.readEntries(0, numEntriesToWrite - 1);
             int i = 0;
-            while(ls.hasMoreElements()){
+            while (ls.hasMoreElements()) {
                 ByteBuffer result = ByteBuffer.wrap(ls.nextElement().getEntry());
                 LOG.debug("Length of result: " + result.capacity());
-                
+
                 assertTrue("Checking if entry " + i + " has zero bytes", result.capacity() == 0);
             }
             lh.close();
-            ls = lh2.readEntries( 0, numEntriesToWrite - 1);
+            ls = lh2.readEntries(0, numEntriesToWrite - 1);
             i = 0;
-            while(ls.hasMoreElements()){
+            while (ls.hasMoreElements()) {
                 ByteBuffer result = ByteBuffer.wrap(ls.nextElement().getEntry());
                 LOG.debug("Length of result: " + result.capacity());
-                
+
                 assertTrue("Checking if entry " + i + " has zero bytes", result.capacity() == 0);
             }
             lh2.close();
@@ -487,14 +466,10 @@ public class BookieReadWriteTest
         } catch (InterruptedException e) {
             LOG.error("Test failed", e);
             fail("Test failed due to interruption");
-        } 
+        }
     }
-    
-    
-    public void addComplete(int rc, 
-            LedgerHandle lh, 
-            long entryId, 
-            Object ctx) {
+
+    public void addComplete(int rc, LedgerHandle lh, long entryId, Object ctx) {
         SyncObj x = (SyncObj) ctx;
         synchronized (x) {
             x.counter++;
@@ -502,135 +477,45 @@ public class BookieReadWriteTest
         }
     }
 
-    public void readComplete(int rc, 
-            LedgerHandle lh, 
-            LedgerSequence seq,
-            Object ctx) {
-        ls = seq;               
+    public void readComplete(int rc, LedgerHandle lh, Enumeration<LedgerEntry> seq, Object ctx) {
+        ls = seq;
         synchronized (sync) {
             sync.value = true;
             sync.notify();
         }
-        
-    }
-     
-    protected void setUp() throws IOException, InterruptedException {
-        LOG.addAppender(ca);
-        LOG.setLevel((Level) Level.DEBUG);
-        
-        // create a ZooKeeper server(dataDir, dataLogDir, port)
-        LOG.debug("Running ZK server");
-        //ServerStats.registerAsConcrete();
-        ClientBase.setupTestEnv();
-        ZkTmpDir = File.createTempFile("zookeeper", "test");
-        ZkTmpDir.delete();
-        ZkTmpDir.mkdir();
-            
-        try {
-            zks = new ZooKeeperServer(ZkTmpDir, ZkTmpDir, ZooKeeperDefaultPort);
-            serverFactory =  new NIOServerCnxn.Factory(ZooKeeperDefaultPort);
-            serverFactory.startup(zks);
-        } catch (IOException e1) {
-            // TODO Auto-generated catch block
-            e1.printStackTrace();
-        } catch (InterruptedException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        }
-        boolean b = ClientBase.waitForServerUp(HOSTPORT, ClientBase.CONNECTION_TIMEOUT);
-        
-        LOG.debug("Server up: " + b);
-        
-        // create a zookeeper client
-        LOG.debug("Instantiate ZK Client");
-        zkc = new ZooKeeper("127.0.0.1", ZooKeeperDefaultPort, new emptyWatcher());
-        
-        //initialize the zk client with values
-        try {
-            zkc.create("/ledgers", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort + 1), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zkc.create("/ledgers/available/127.0.0.1:" + Integer.toString(initialPort + 2), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-        } catch (KeeperException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        } catch (InterruptedException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        }
-        
-        // Create Bookie Servers (B1, B2, B3)
-        tmpDirB1 = File.createTempFile("bookie1", "test");
-        tmpDirB1.delete();
-        tmpDirB1.mkdir();
-         
-        bs1 = new BookieServer(initialPort, tmpDirB1, new File[]{tmpDirB1});
-        bs1.start();
-        
-        tmpDirB2 = File.createTempFile("bookie2", "test");
-        tmpDirB2.delete();
-        tmpDirB2.mkdir();
-            
-        bs2 = new BookieServer(initialPort + 1, tmpDirB2, new File[]{tmpDirB2});
-        bs2.start();
-
-        tmpDirB3 = File.createTempFile("bookie3", "test");
-        tmpDirB3.delete();
-        tmpDirB3.mkdir();
-        
-        bs3 = new BookieServer(initialPort + 2, tmpDirB3, new File[]{tmpDirB3});
-        bs3.start();
-        
-        rng = new Random(System.currentTimeMillis());   // Initialize the Random Number Generator 
-        entries = new ArrayList<byte[]>(); // initialize the  entries list
-        entriesSize = new ArrayList<Integer>(); 
-        sync = new SyncObj(); // initialize the synchronization data structure
-        zkc.close();
+
     }
-    
-    protected void tearDown(){
-        LOG.info("TearDown");
 
-        //shutdown bookie servers 
-        try {
-            bs1.shutdown();
-            bs2.shutdown();
-            bs3.shutdown();
-        } catch (InterruptedException e) {
-            e.printStackTrace();
-        }
-        cleanUpDir(tmpDirB1);
-        cleanUpDir(tmpDirB2);
-        cleanUpDir(tmpDirB3);
-        
-        //shutdown ZK server
-        serverFactory.shutdown();
-        assertTrue("waiting for server down",
-                ClientBase.waitForServerDown(HOSTPORT,
-                                             ClientBase.CONNECTION_TIMEOUT));
-        //ServerStats.unregister();
-        cleanUpDir(ZkTmpDir);
+    @Before
+    public void setUp() throws Exception{
+        super.setUp();
+        rng = new Random(System.currentTimeMillis()); // seed the RNG from the
+                                                      // current wall-clock time
+        entries = new ArrayList<byte[]>(); // payloads written, kept for read-back comparison
+        entriesSize = new ArrayList<Integer>();
+        sync = new SyncObj(); // initialize the synchronization data structure
         
     }
 
-    /*  Clean up a directory recursively */
-    protected boolean cleanUpDir(File dir){
+    /* Recursively deletes dir and its contents; returns false as soon as a child cannot be removed, otherwise the result of deleting dir itself */
+    protected boolean cleanUpDir(File dir) {
         if (dir.isDirectory()) {
             LOG.info("Cleaning up " + dir.getName());
             String[] children = dir.list(); // NOTE(review): File.list() may return null on I/O error; not handled here
             for (String string : children) {
                 boolean success = cleanUpDir(new File(dir, string));
-                if (!success) return false;
+                if (!success)
+                    return false;
             }
         }
         // The directory is now empty so delete it
-        return dir.delete();        
+        return dir.delete();
     }
 
-    /*  User for testing purposes, void */
-    class emptyWatcher implements Watcher{
-        public void process(WatchedEvent event) {}
+    /* Used for testing purposes: a Watcher whose process() ignores every event */
+    class emptyWatcher implements Watcher {
+        public void process(WatchedEvent event) {
+        }
     }
 
 }

+ 29 - 249
src/contrib/bookkeeper/test/org/apache/bookkeeper/test/CloseTest.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.test;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,275 +21,54 @@ package org.apache.bookkeeper.test;
  * 
  */
 
-
-import static org.apache.zookeeper.test.ClientBase.CONNECTION_TIMEOUT;
-
-import java.lang.InterruptedException;
-import java.io.File;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import junit.framework.TestCase;
-
 import org.junit.*;
-import org.apache.bookkeeper.client.BKException;
-import org.apache.bookkeeper.client.BookKeeper;
 import org.apache.bookkeeper.client.LedgerHandle;
-import org.apache.bookkeeper.proto.BookieServer;
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
 import org.apache.log4j.Logger;
 
-import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.CreateMode;
-import org.apache.zookeeper.ZooDefs.Ids;
-import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.WatchedEvent;
-import org.apache.zookeeper.ZooKeeper;
-import org.apache.zookeeper.server.NIOServerCnxn;
-import org.apache.zookeeper.server.ZooKeeperServer;
-import org.apache.zookeeper.server.ServerStats;
-import org.apache.zookeeper.test.ClientBase;
-
 /**
- * This unit test tests closing ledgers sequentially. 
- * It creates 4 ledgers, then write 1000 entries to each 
- * ledger and close it.
+ * This unit test tests closing ledgers sequentially. It creates 4 ledgers,
+ * writes 100 entries to each ledger, and then closes them one after another.
  * 
  */
 
-public class CloseTest 
-extends TestCase 
-implements Watcher {
+public class CloseTest extends BaseTestCase{
     static Logger LOG = Logger.getLogger(LedgerRecoveryTest.class);
-    
-    BookieServer bs1, bs2, bs3;
-    File tmpDir1, tmpDir2, tmpDir3, tmpDirZK;
-    private static final String HOSTPORT = "127.0.0.1:33299";
-    private NIOServerCnxn.Factory serverFactory;
-    
-    private static String BOOKIEADDR1 = "127.0.0.1:33300";
-    private static String BOOKIEADDR2 = "127.0.0.1:33301";
-    private static String BOOKIEADDR3 = "127.0.0.1:33302";
-    
-    private static void recursiveDelete(File dir) {
-        File children[] = dir.listFiles();
-        if (children != null) {
-            for(File child: children) {
-                recursiveDelete(child);
-            }
-        }
-        dir.delete();
-    }
-    
-    protected void setUp() throws Exception {
-        /*
-         * Creates 3 BookieServers
-         */
-        
-        
-        tmpDir1 = File.createTempFile("bookie1", "test");
-        tmpDir1.delete();
-        tmpDir1.mkdir();
-        
-        final int PORT1 = Integer.parseInt(BOOKIEADDR1.split(":")[1]);
-        bs1 = new BookieServer(PORT1, tmpDir1, new File[] { tmpDir1 });
-        bs1.start();
-        
-        tmpDir2 = File.createTempFile("bookie2", "test");
-        tmpDir2.delete();
-        tmpDir2.mkdir();
-        
-        final int PORT2 = Integer.parseInt(BOOKIEADDR2.split(":")[1]);
-        bs2 = new BookieServer(PORT2, tmpDir2, new File[] { tmpDir2 });
-        bs2.start();
-        
-        tmpDir3 = File.createTempFile("bookie3", "test");
-        tmpDir3.delete();
-        tmpDir3.mkdir();
-        
-        final int PORT3 = Integer.parseInt(BOOKIEADDR3.split(":")[1]);
-        bs3 = new BookieServer(PORT3, tmpDir3, new File[] { tmpDir3 });
-        bs3.start();
-        
-        /*
-         * Instantiates a ZooKeeper server. This is a blind copy
-         * of setUp from SessionTest.java.
-         */
-        LOG.info("STARTING " + getName());
-
-        //ServerStats.registerAsConcrete();
-
-        tmpDirZK = ClientBase.createTmpDir();
-
-        ClientBase.setupTestEnv();
-        ZooKeeperServer zs = new ZooKeeperServer(tmpDirZK, tmpDirZK, 3000);
-        
-        final int PORT = Integer.parseInt(HOSTPORT.split(":")[1]);
-        serverFactory = new NIOServerCnxn.Factory(PORT);
-        serverFactory.startup(zs);
-
-        assertTrue("waiting for server up",
-                   ClientBase.waitForServerUp(HOSTPORT,
-                                              CONNECTION_TIMEOUT));
-        
-        /*
-         * Creating necessary znodes
-         */
-        try{
-            ZooKeeper zk = new ZooKeeper(HOSTPORT, 3000, this);
-            zk.create("/ledgers", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zk.create("/ledgers/available", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zk.create("/ledgers/available/" + BOOKIEADDR1, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT );
-            zk.create("/ledgers/available/" + BOOKIEADDR2, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT );
-            zk.create("/ledgers/available/" + BOOKIEADDR3, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT );
-            zk.close();
-        } catch (KeeperException ke) {
-            LOG.error(ke);
-            fail("Couldn't execute ZooKeeper start procedure");
-        }
-        
-    }
-    
-    /**
-     * Watcher method. 
-     */
-    synchronized public void process(WatchedEvent event) {
-        LOG.info("Process: " + event.getType() + " " + event.getPath());
-    }
-    
-    protected void tearDown() throws Exception {
-        LOG.info("### Tear down ###");
-        bs1.shutdown();
-        recursiveDelete(tmpDir1);
-        
-        bs2.shutdown();
-        recursiveDelete(tmpDir2);
-        
-        bs3.shutdown();
-        recursiveDelete(tmpDir3);
-        
-        serverFactory.shutdown();
-        assertTrue("waiting for server down",
-                   ClientBase.waitForServerDown(HOSTPORT,
-                                                CONNECTION_TIMEOUT));
+    DigestType digestType;
 
-        //ServerStats.unregister();
-        recursiveDelete(tmpDirZK);
-        LOG.info("FINISHED " + getName());
+    public CloseTest(DigestType digestType) {
+        super(3);
+        this.digestType = digestType;
     }
 
     @Test
-    public void testClose(){
-        /*
-         * Instantiate BookKeeper object.
-         */
-        BookKeeper bk = null;
-        try{
-            bk = new BookKeeper(HOSTPORT);
-        } catch (KeeperException ke){
-            LOG.error("Error instantiating BookKeeper", ke);
-            fail("ZooKeeper error");
-        } catch (IOException ioe){
-            LOG.error(ioe);
-            fail("Failure due to IOException");
-        }
-        
+    public void testClose() throws Exception {
+
         /*
          * Create 4 ledgers.
          */
-        LedgerHandle lh1 = null;
-        LedgerHandle lh2 = null;
-        LedgerHandle lh3 = null;
-        LedgerHandle lh4 = null;
-        
-        try{
-            lh1 = bk.createLedger("".getBytes());
-            lh2 = bk.createLedger("".getBytes());
-            lh3 = bk.createLedger("".getBytes());
-            lh4 = bk.createLedger("".getBytes());
-        } catch (KeeperException ke){
-            LOG.error("Error creating a ledger", ke);
-            fail("ZooKeeper error");            
-        } catch (BKException bke){
-            LOG.error("BookKeeper error");
-            fail("BookKeeper error");
-        } catch (InterruptedException ie) {
-            LOG.error(ie);
-            fail("Failure due to interrupted exception");
-        } catch (IOException ioe) {
-            LOG.error(ioe);
-            fail("Failure due to IO exception");
-        }
-        
-        /*
-         * Write a 1000 entries to lh1.
-         */
-        try{
-            String tmp = "BookKeeper is cool!";
-            for(int i = 0; i < 1000; i++){
-                lh1.addEntry(tmp.getBytes());
-            }
-        } catch(InterruptedException e){
-            LOG.error("Interrupted when adding entry", e);
-            fail("Couldn't finish adding entries");
-        } catch(BKException e){
-            LOG.error("BookKeeper exception", e);
-            fail("BookKeeper exception when adding entries");
-        }
-        
-        try{
-            lh1.close();
-        } catch(Exception e) {
-            LOG.error(e);
-            fail("Exception while closing ledger 1");
-        }
-        /*
-         * Write a 1000 entries to lh2.
-         */
-        try{
-            String tmp = "BookKeeper is cool!";
-            for(int i = 0; i < 1000; i++){
-                lh2.addEntry(tmp.getBytes());
-            }
-        } catch(InterruptedException e){
-            LOG.error("Interrupted when adding entry", e);
-            fail("Couldn't finish adding entries");
-        } catch(BKException e){
-            LOG.error("BookKeeper exception", e);
-            fail("CBookKeeper exception while adding entries");
-        }
-        
-        try{
-            lh2.close();
-        } catch(Exception e){
-            LOG.error(e);
-            fail("Exception while closing ledger 2");
+        int numLedgers = 4;
+        int numMsgs = 100;
+
+        LedgerHandle[] lh = new LedgerHandle[numLedgers];
+        for (int i = 0; i < numLedgers; i++) {
+            lh[i] = bkc.createLedger(digestType, "".getBytes());
         }
-        
+
+        String tmp = "BookKeeper is cool!";
+
         /*
-         * Write a 1000 entries to lh3 and lh4.
+         * Write numMsgs entries to every ledger, interleaving the ledgers.
          */
-        try{
-            String tmp = "BookKeeper is cool!";
-            for(int i = 0; i < 1000; i++){
-                lh3.addEntry(tmp.getBytes());
-                lh4.addEntry(tmp.getBytes());
+        for (int i = 0; i < numMsgs; i++) {
+            for (int j = 0; j < numLedgers; j++) {
+                lh[j].addEntry(tmp.getBytes());
             }
-        } catch(InterruptedException e){
-            LOG.error("Interrupted when adding entry", e);
-            fail("Couldn't finish adding entries");
-        } catch(BKException e){
-            LOG.error("BookKeeper exception", e);
-            fail("BookKeeper exception when adding entries");
         }
-        
-        try{
-            lh3.close();
-            lh4.close();
-        } catch(Exception e){
-            LOG.error(e);
-            fail("Exception while closing ledger 4");
+
+        for (int i = 0; i < numLedgers; i++) {
+
+            lh[i].close();
         }
-    }      
+    }
 }
-    
-    

+ 178 - 0
src/contrib/bookkeeper/test/org/apache/bookkeeper/test/ConcurrentLedgerTest.java

@@ -0,0 +1,178 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.bookkeeper.test;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.nio.ByteBuffer;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.bookkeeper.bookie.Bookie;
+import org.apache.bookkeeper.bookie.BookieException;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.WriteCallback;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests writing to concurrent ledgers.
+ *
+ * <p>The test drives a {@link Bookie} instance directly through
+ * {@code addEntry}/{@code readEntry}: entries are spread round-robin over a
+ * growing number of ledgers and afterwards read back and verified.
+ *
+ * <p>System properties that tune a run:
+ * <ul>
+ * <li>{@code txnDir}, {@code ledgerDir} - parent directories in which the
+ * temporary transaction and ledger directories are created (default: the
+ * JVM temp directory)</li>
+ * <li>{@code iterations} - largest number of ledgers to use (default 51)</li>
+ * <li>{@code iterationStep} - ledger-count increment between rounds
+ * (default 25)</li>
+ * <li>{@code totalwrites} - number of writes per round (default 128)</li>
+ * </ul>
+ */
+public class ConcurrentLedgerTest extends TestCase {
+    /** Upper bound on the number of writes awaiting their callback. */
+    private static final int MAX_OUTSTANDING_WRITES = 10000;
+
+    Bookie bookie;
+    File txnDir, ledgerDir;
+    int recvTimeout = 10000;
+    Semaphore throttle;
+
+    /**
+     * Creates fresh temporary transaction and ledger directories and a
+     * {@link Bookie} on top of them.
+     *
+     * @throws IOException if a temporary directory or the bookie cannot be
+     *             created
+     */
+    @Override
+    @Before
+    public void setUp() throws IOException {
+        // Keep the user-supplied parents in locals: the fields must only ever
+        // name directories created here, because tearDown() deletes them.
+        String txnDirName = System.getProperty("txnDir");
+        File txnParent = txnDirName != null ? new File(txnDirName) : null;
+        String ledgerDirName = System.getProperty("ledgerDir");
+        File ledgerParent = ledgerDirName != null ? new File(ledgerDirName) : null;
+
+        txnDir = createTempDir(".txn", txnParent);
+        ledgerDir = createTempDir(".ledger", ledgerParent);
+
+        bookie = new Bookie(txnDir, new File[] {ledgerDir});
+    }
+
+    /**
+     * Creates a uniquely named directory {@code book<random><suffix>.dir}.
+     *
+     * @param suffix suffix of the temporary name the directory is derived from
+     * @param parent directory to create it in, or null for the JVM temp
+     *            directory
+     * @return the newly created directory
+     * @throws IOException if the directory cannot be created
+     */
+    private static File createTempDir(String suffix, File parent) throws IOException {
+        // createTempFile is only used to reserve a unique name; the
+        // placeholder file itself is removed again right away.
+        File tmpFile = File.createTempFile("book", suffix, parent);
+        tmpFile.delete();
+        File dir = new File(tmpFile.getParent(), tmpFile.getName() + ".dir");
+        if (!dir.mkdirs() && !dir.isDirectory()) {
+            throw new IOException("Could not create directory " + dir);
+        }
+        return dir;
+    }
+
+    /**
+     * Deletes {@code f} and, if it is a directory, everything below it.
+     * Deletion is best effort: failures of individual deletes are ignored.
+     *
+     * @param f file or directory to delete; null is ignored
+     */
+    static void recursiveDelete(File f) {
+        if (f == null) {
+            return;
+        }
+        // listFiles() returns null for anything that is not a readable
+        // directory (plain file, missing path, I/O error).
+        File[] children = f.listFiles();
+        if (children != null) {
+            for (File child : children) {
+                recursiveDelete(child);
+            }
+        }
+        f.delete();
+    }
+
+    /**
+     * Shuts the bookie down and removes the temporary directories. The
+     * directories are removed even when the shutdown is interrupted.
+     */
+    @Override
+    @After
+    public void tearDown() {
+        try {
+            if (bookie != null) {
+                bookie.shutdown();
+            }
+        } catch (InterruptedException e) {
+            e.printStackTrace();
+            // Preserve the interrupt for whoever runs this test.
+            Thread.currentThread().interrupt();
+        } finally {
+            recursiveDelete(txnDir);
+            recursiveDelete(ledgerDir);
+        }
+    }
+
+    /**
+     * Reads an integer system property.
+     *
+     * @param name property name
+     * @param defaultValue value to use when the property is not set
+     * @return the parsed property value, or {@code defaultValue}
+     * @throws NumberFormatException if the property is set but not an integer
+     */
+    private static int intProperty(String name, int defaultValue) {
+        String value = System.getProperty(name);
+        return value != null ? Integer.parseInt(value) : defaultValue;
+    }
+
+    // Passed as the last argument of Bookie.addEntry
+    // NOTE(review): presumably the ledger master key - confirm against Bookie.
+    byte zeros[] = new byte[16];
+
+    int iterations = intProperty("iterations", 51);
+    int iterationStep = intProperty("iterationStep", 25);
+
+    /**
+     * Writes {@code totalwrites} entries spread over 1, 1 + step, 1 + 2*step,
+     * ... ledgers (up to {@code iterations}), then reads every round back and
+     * checks the payload. Timings are printed to standard output.
+     */
+    @Test
+    public void testConcurrentWrite() throws IOException, InterruptedException, BookieException {
+        int size = 1024;
+        int totalwrites = intProperty("totalwrites", 128);
+        System.out.println("Running up to " + iterations + " iterations");
+        System.out.println("Total writes = " + totalwrites);
+        int ledgers;
+        for(ledgers = 1; ledgers <= iterations; ledgers += iterationStep) {
+            long duration = doWrites(ledgers, size, totalwrites);
+            System.out.println(totalwrites + " on " + ledgers + " took " + duration + " ms");
+        }
+        System.out.println("ledgers " + ledgers);
+        for(ledgers = 1; ledgers <= iterations; ledgers += iterationStep) {
+            long duration = doReads(ledgers, size, totalwrites);
+            System.out.println(ledgers + " read " + duration + " ms");
+        }
+    }
+
+    /**
+     * Reads back entries 1..totalwrites/ledgers of ledgers 1..ledgers and
+     * verifies the two marker values written by {@link #doWrites}.
+     *
+     * @param ledgers number of ledgers to read from
+     * @param size entry size used when writing (not needed for reading)
+     * @param totalwrites total number of writes of the round being verified
+     * @return elapsed time in milliseconds
+     */
+    private long doReads(int ledgers, int size, int totalwrites)
+            throws IOException, InterruptedException, BookieException {
+        long start = System.currentTimeMillis();
+        for(int i = 1; i <= totalwrites/ledgers; i++) {
+            for(int j = 1; j <= ledgers; j++) {
+                ByteBuffer entry = bookie.readEntry(j, i);
+                // skip the ledger id and the entry id
+                entry.getLong();
+                entry.getLong();
+                assertEquals(j + "@" + i, j+2, entry.getLong());
+                assertEquals(j + "@" + i, i+3, entry.getLong());
+            }
+        }
+        long finish = System.currentTimeMillis();
+        return finish - start;
+    }
+
+    /**
+     * Submits totalwrites/ledgers entries to each of the ledgers 1..ledgers.
+     * Every entry starts with four longs: ledger id, entry id, ledger id + 2
+     * and entry id + 3; the last two are the markers checked by
+     * {@link #doReads}.
+     *
+     * <p>NOTE(review): the returned duration covers submitting the writes
+     * only; the method does not wait for outstanding callbacks, and the
+     * callback counter is never read - confirm this is intended.
+     *
+     * @param ledgers number of ledgers to write to
+     * @param size size in bytes of each entry buffer
+     * @param totalwrites total number of writes for this round
+     * @return elapsed time in milliseconds
+     */
+    private long doWrites(int ledgers, int size, int totalwrites)
+            throws IOException, InterruptedException, BookieException {
+        throttle = new Semaphore(MAX_OUTSTANDING_WRITES);
+        WriteCallback cb = new WriteCallback() {
+            @Override
+            public void writeComplete(int rc, long ledgerId, long entryId,
+                    InetSocketAddress addr, Object ctx) {
+                AtomicInteger counter = (AtomicInteger)ctx;
+                counter.getAndIncrement();
+                // hand the permit taken before addEntry back to the writer
+                throttle.release();
+            }
+        };
+        AtomicInteger counter = new AtomicInteger();
+        long start = System.currentTimeMillis();
+        for(int i = 1; i <= totalwrites/ledgers; i++) {
+            for(int j = 1; j <= ledgers; j++) {
+                ByteBuffer bytes = ByteBuffer.allocate(size);
+                bytes.putLong(j);
+                bytes.putLong(i);
+                bytes.putLong(j+2);
+                bytes.putLong(i+3);
+                bytes.put(("This is ledger " + j + " entry " + i).getBytes());
+                // expose the whole buffer, padding included, to the bookie
+                bytes.position(0);
+                bytes.limit(bytes.capacity());
+                throttle.acquire();
+                bookie.addEntry(bytes, cb, counter, zeros);
+            }
+        }
+        long finish = System.currentTimeMillis();
+        return finish - start;
+    }
+}

+ 36 - 288
src/contrib/bookkeeper/test/org/apache/bookkeeper/test/LedgerRecoveryTest.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.test;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,318 +21,65 @@ package org.apache.bookkeeper.test;
  * 
  */
 
-
-import static org.apache.zookeeper.test.ClientBase.CONNECTION_TIMEOUT;
-
-import java.lang.InterruptedException;
-import java.io.File;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import junit.framework.TestCase;
-
 import org.junit.*;
-import org.apache.bookkeeper.client.BKException;
-import org.apache.bookkeeper.client.BookKeeper;
 import org.apache.bookkeeper.client.LedgerHandle;
-import org.apache.bookkeeper.proto.BookieServer;
+import org.apache.bookkeeper.client.BookKeeper.DigestType;
 import org.apache.log4j.Logger;
 
-import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.CreateMode;
-import org.apache.zookeeper.ZooDefs.Ids;
-import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.WatchedEvent;
-import org.apache.zookeeper.ZooKeeper;
-import org.apache.zookeeper.server.NIOServerCnxn;
-import org.apache.zookeeper.server.ZooKeeperServer;
-import org.apache.zookeeper.server.ServerStats;
-import org.apache.zookeeper.test.ClientBase;
-
 /**
- * This unit test tests ledger recovery. 
+ * This unit test tests ledger recovery.
+ * 
  * 
- *
  */
 
-public class LedgerRecoveryTest 
-extends TestCase 
-implements Watcher {
+public class LedgerRecoveryTest extends BaseTestCase {
     static Logger LOG = Logger.getLogger(LedgerRecoveryTest.class);
-    
-    BookieServer bs1, bs2, bs3;
-    File tmpDir1, tmpDir2, tmpDir3, tmpDirZK;
-    private static final String HOSTPORT = "127.0.0.1:33299";
-    private NIOServerCnxn.Factory serverFactory;
-    
-    private static String BOOKIEADDR1 = "127.0.0.1:33300";
-    private static String BOOKIEADDR2 = "127.0.0.1:33301";
-    private static String BOOKIEADDR3 = "127.0.0.1:33302";
-    
-    private static void recursiveDelete(File dir) {
-        File children[] = dir.listFiles();
-        if (children != null) {
-            for(File child: children) {
-                recursiveDelete(child);
-            }
-        }
-        dir.delete();
-    }
-    
-    protected void setUp() throws Exception {
-        /*
-         * Creates 3 BookieServers
-         */
-        
-        
-        tmpDir1 = File.createTempFile("bookie1", "test");
-        tmpDir1.delete();
-        tmpDir1.mkdir();
-        
-        final int PORT1 = Integer.parseInt(BOOKIEADDR1.split(":")[1]);
-        bs1 = new BookieServer(PORT1, tmpDir1, new File[] { tmpDir1 });
-        bs1.start();
-        
-        tmpDir2 = File.createTempFile("bookie2", "test");
-        tmpDir2.delete();
-        tmpDir2.mkdir();
-        
-        final int PORT2 = Integer.parseInt(BOOKIEADDR2.split(":")[1]);
-        bs2 = new BookieServer(PORT2, tmpDir2, new File[] { tmpDir2 });
-        bs2.start();
-        
-        tmpDir3 = File.createTempFile("bookie3", "test");
-        tmpDir3.delete();
-        tmpDir3.mkdir();
-        
-        final int PORT3 = Integer.parseInt(BOOKIEADDR3.split(":")[1]);
-        bs3 = new BookieServer(PORT3, tmpDir3, new File[] { tmpDir3 });
-        bs3.start();
-        
-        /*
-         * Instantiates a ZooKeeper server. This is a blind copy
-         * of setUp from SessionTest.java.
-         */
-        LOG.info("STARTING " + getName());
-
-        //ServerStats.registerAsConcrete();
 
-        tmpDirZK = ClientBase.createTmpDir();
+    DigestType digestType;
 
-        ClientBase.setupTestEnv();
-        ZooKeeperServer zs = new ZooKeeperServer(tmpDirZK, tmpDirZK, 3000);
-        
-        final int PORT = Integer.parseInt(HOSTPORT.split(":")[1]);
-        serverFactory = new NIOServerCnxn.Factory(PORT);
-        serverFactory.startup(zs);
-
-        assertTrue("waiting for server up",
-                   ClientBase.waitForServerUp(HOSTPORT,
-                                              CONNECTION_TIMEOUT));
-        
-        /*
-         * Creating necessary znodes
-         */
-        try{
-            ZooKeeper zk = new ZooKeeper(HOSTPORT, 3000, this);
-            zk.create("/ledgers", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zk.create("/ledgers/available", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-            zk.create("/ledgers/available/" + BOOKIEADDR1, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT );
-            zk.create("/ledgers/available/" + BOOKIEADDR2, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT );
-            zk.create("/ledgers/available/" + BOOKIEADDR3, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT );
-            zk.close();
-        } catch (KeeperException ke) {
-            LOG.error(ke);
-            fail("Couldn't execute ZooKeeper start procedure");
-        }
-        
-    }
-    
-    /**
-     * Watcher method. 
-     */
-    synchronized public void process(WatchedEvent event) {
-        LOG.info("Process: " + event.getType() + " " + event.getPath());
+    /**
+     * Creates the test for one digest type.
+     *
+     * @param digestType digest type passed to every createLedger and
+     *                   openLedger call made by this test
+     */
+    public LedgerRecoveryTest(DigestType digestType) {
+        // NOTE(review): 3 is presumably the number of bookies BaseTestCase
+        // starts (the removed setUp started three) - confirm
+        super(3);
+        this.digestType = digestType;
     }
-    
-    protected void tearDown() throws Exception {
-        LOG.info("### Tear down ###");
-        bs1.shutdown();
-        recursiveDelete(tmpDir1);
-        
-        bs2.shutdown();
-        recursiveDelete(tmpDir2);
-        
-        bs3.shutdown();
-        recursiveDelete(tmpDir3);
-        
-        serverFactory.shutdown();
-        assertTrue("waiting for server down",
-                   ClientBase.waitForServerDown(HOSTPORT,
-                                                CONNECTION_TIMEOUT));
 
-        //ServerStats.unregister();
-        recursiveDelete(tmpDirZK);
-        LOG.info("FINISHED " + getName());
-    }
-    
-    @Test
-    public void testLedgerRecovery(){
-        /*
-         * Instantiate BookKeeper object.
-         */
-        BookKeeper bk = null;
-        try{
-            bk = new BookKeeper(HOSTPORT);
-        } catch (KeeperException ke){
-            LOG.error("Error instantiating BookKeeper", ke);
-            fail("ZooKeeper error");
-        } catch (IOException ioe){
-            LOG.error(ioe);
-            fail("Failure due to IOException");
-        }
-        
+    /**
+     * Writes {@code numEntries} entries to a newly created ledger, opens the
+     * same ledger again by id while the writing handle is still open, and
+     * asserts that the re-opened handle reports {@code numEntries - 1} as its
+     * last confirmed add.
+     *
+     * @param numEntries number of entries added before the ledger is re-opened
+     * @throws Exception if creating, writing to, or opening the ledger fails
+     */
+    private void testInternal(int numEntries) throws Exception {
         /*
          * Create ledger.
          */
         LedgerHandle beforelh = null;
-        try{
-            beforelh = bk.createLedger("".getBytes());
-        } catch (KeeperException ke){
-            LOG.error("Error creating a ledger", ke);
-            fail("ZooKeeper error");            
-        } catch (BKException bke){
-            LOG.error("BookKeeper error");
-            fail("BookKeeper error");
-        } catch (InterruptedException ie) {
-            LOG.error(ie);
-            fail("Failure due to interrupted exception");
-        } catch (IOException ioe) {
-            LOG.error(ioe);
-            fail("Failure due to IO exception");
+        beforelh = bkc.createLedger(digestType, "".getBytes());
+
+        String tmp = "BookKeeper is cool!";
+        for (int i = 0; i < numEntries; i++) {
+            beforelh.addEntry(tmp.getBytes());
         }
-        
+
         /*
-         * Write a 1000 entries.
+         * Try to open ledger.
          */
-        try{
-            String tmp = "BookKeeper is cool!";
-            for(int i = 0; i < 1000; i++){
-                beforelh.addEntry(tmp.getBytes());
-            }
-            
-            //bk.resetLedger(beforelh);
-        } catch(InterruptedException e){
-            LOG.error("Interrupted when adding entry", e);
-            fail("Couldn't finish adding entries");
-        } catch(BKException e){
-            LOG.error("BookKeeper exception", e);
-            fail("BookKeeper exception while adding entries");
-        }
-        
+        LedgerHandle afterlh = bkc.openLedger(beforelh.getId(), digestType, "".getBytes());
+
         /*
-         * Try to open ledger.
+         * Check if has recovered properly.
          */
-        try{
-            LedgerHandle afterlh = bk.openLedger(beforelh.getId(), "".getBytes());
-            
-            /*
-             * Check if has recovered properly.
-             */
-            assertTrue("Has not recovered correctly: " + afterlh.getLast(), afterlh.getLast() == 999);
-        } catch (KeeperException e) {
-            LOG.error("Error when opening ledger", e);
-            fail("Couldn't open ledger");
-        } catch (InterruptedException ie) {
-            LOG.error("Interrupted exception", ie);
-            fail("Failure due to interrupted exception");
-        } catch (IOException ioe) {
-            LOG.error("IO Exception", ioe);
-            fail("Failure due to IO exception");
-        } catch (BKException bke){
-            LOG.error("BookKeeper error", bke);
-            fail("BookKeeper error");
-        }
-        
+        // entry ids start at 0, so the last one written is numEntries - 1
+        assertTrue("Has not recovered correctly: " + afterlh.getLastAddConfirmed(),
+                afterlh.getLastAddConfirmed() == numEntries - 1);        
     }
     
     @Test
-    public void testEmptyLedgerRecovery(){
-        /*
-         * Instantiate BookKeeper object.
-         */
-        BookKeeper bk = null;
-        try{
-            bk = new BookKeeper(HOSTPORT);
-        } catch (KeeperException ke){
-            LOG.error("Error instantiating BookKeeper", ke);
-            fail("ZooKeeper error");
-        } catch (IOException ioe){
-            LOG.error(ioe);
-            fail("Failure due to IOException");
-        }
-        
-        /*
-         * Create ledger.
-         */
-        LedgerHandle beforelh = null;
-        try{
-            beforelh = bk.createLedger("".getBytes());
-        } catch (KeeperException ke){
-            LOG.error("Error creating a ledger", ke);
-            fail("ZooKeeper error");            
-        } catch (BKException bke){
-            LOG.error("BookKeeper error");
-            fail("BookKeeper error");
-        } catch (InterruptedException ie) {
-            LOG.error(ie);
-            fail("Failure due to interrupted exception");
-        } catch (IOException ioe) {
-            LOG.error(ioe);
-            fail("Failure due to IO exception");
-        }
-        
-        /*
-         * Write a 1 entry.
-         */
-        try{
-            String tmp = "BookKeeper is cool!";
-            for(int i = 0; i < 1; i++){
-                beforelh.addEntry(tmp.getBytes());
-            }
-        } catch(InterruptedException e){
-            LOG.error("Interrupted when adding entry", e);
-            fail("Couldn't finish adding entries");
-        } catch(BKException e){
-            LOG.error("BookKeeper exception", e);
-            fail("BookKeeper exception while adding entries");
-        }
-        
-        
-        /*
-         * Try to open ledger.
-         */
-        try{
-            LedgerHandle afterlh = bk.openLedger(beforelh.getId(), "".getBytes());
-            
-            /*
-             * Check if has recovered properly.
-             */
-            assertTrue("Has not recovered correctly: " + afterlh.getLast(), afterlh.getLast() == 0);
-        } catch (KeeperException e) {
-            LOG.error("Error when opening ledger", e);
-            fail("Couldn't open ledger");
-        } catch (InterruptedException ie) {
-            LOG.error("Interrupted exception", ie);
-            fail("Failure due to interrupted exception");
-        } catch (IOException ioe) {
-            LOG.error("IO Exception", ioe);
-            fail("Failure due to IO exception");
-        } catch (BKException bke){
-            LOG.error("BookKeeper error", bke);
-            fail("BookKeeper error");
-        }
-        
+    public void testLedgerRecovery() throws Exception {
+        // recovery of a ledger that holds 100 entries
+        testInternal(100);
+     
     }
-    
+
+    /**
+     * Runs the recovery check on a ledger that holds exactly one entry.
+     */
+    @Test
+    public void testEmptyLedgerRecoveryOne() throws Exception{
+        testInternal(1);
+    }
+
+    /**
+     * Runs the recovery check on a ledger to which no entry was written;
+     * testInternal then expects a last confirmed add of -1.
+     */
+    @Test
+    public void testEmptyLedgerRecovery() throws Exception{
+        testInternal(0);
+    }
+
 }

+ 43 - 48
src/contrib/bookkeeper/test/org/apache/bookkeeper/test/LoopbackClient.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.test;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,103 +21,97 @@ package org.apache.bookkeeper.test;
  * 
  */
 
-
 import java.net.InetSocketAddress;
-import java.nio.ByteBuffer;
 import java.io.IOException;
 import java.lang.InterruptedException;
 import java.util.Arrays;
+import java.util.concurrent.Executors;
 
 import org.apache.bookkeeper.proto.BookieClient;
-import org.apache.bookkeeper.proto.WriteCallback;
+import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.WriteCallback;
+import org.apache.bookkeeper.util.OrderedSafeExecutor;
 import org.apache.log4j.Logger;
-
+import org.jboss.netty.buffer.ChannelBuffers;
+import org.jboss.netty.channel.socket.ClientSocketChannelFactory;
+import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory;
 
 /**
  * This class tests BookieClient. It just sends a new entry to itself.
  * 
  * 
- *
+ * 
  */
 
-
 class LoopbackClient implements WriteCallback {
     Logger LOG = Logger.getLogger(LoopbackClient.class);
     BookieClient client;
     static int recvTimeout = 2000;
     long begin = 0;
     int limit;
-    
-    
+    OrderedSafeExecutor executor;
+
     static class Counter {
+        // Counts completed writes. main() passes one instance as the callback
+        // context and waits on it; writeComplete() calls increment() on it.
+
+        // number of completions seen so far
         int c;
+        // completion count at which the waiter is notified
         int limit;
-        
-        Counter(int limit){
+
+        Counter(int limit) {
             this.c = 0;
             this.limit = limit;
         }
-        
-        synchronized void increment(){
-            if(++c == limit) 
+
+        // Bumps the count and wakes one thread waiting on this object at the
+        // moment the count becomes equal to limit.
+        synchronized void increment() {
+            if (++c == limit)
                 this.notify();
         }
     }
-    
-    LoopbackClient(int port, long begin, int limit)
-    throws IOException {
-        this.client = 
-            new BookieClient(new InetSocketAddress("127.0.0.1", port), recvTimeout);
+
+    /**
+     * Builds a loopback client on top of a new BookieClient.
+     *
+     * @param channelFactory channel factory handed to the BookieClient
+     * @param executor       executor handed to the BookieClient and kept in
+     *                       the {@code executor} field
+     * @param begin          start timestamp in milliseconds, kept in {@code begin}
+     * @param limit          number of entries the caller intends to write,
+     *                       kept in {@code limit}
+     * @throws IOException declared for callers; nothing in this body throws it
+     *                     directly
+     */
+    LoopbackClient(ClientSocketChannelFactory channelFactory, OrderedSafeExecutor executor, long begin, int limit) throws IOException {
+        this.client = new BookieClient(channelFactory, executor);
+        // executor and limit were accepted but never stored, which left the
+        // fields of the same name at null and 0
+        this.executor = executor;
         this.begin = begin;
+        this.limit = limit;
     }
-    
-    
-    void write(long ledgerId, long entry, byte[] data, WriteCallback cb, Object ctx)
-    throws IOException, InterruptedException {
+
+    /**
+     * Logs the ledger and entry ids, then asks the BookieClient to add one
+     * entry at the given address.
+     *
+     * @param ledgerId ledger the entry is added to
+     * @param entry    id of the entry
+     * @param data     entry payload; wrapped, not copied, before sending
+     * @param addr     address the add request is sent to
+     * @param cb       callback passed through to the client
+     * @param ctx      context object passed through to the client
+     */
+    void write(long ledgerId, long entry, byte[] data, InetSocketAddress addr, WriteCallback cb, Object ctx)
+            throws IOException, InterruptedException {
         LOG.info("Ledger id: " + ledgerId + ", Entry: " + entry);
+        // fixed 20-byte value of 'a' characters
+        // NOTE(review): presumably the ledger password/master key - confirm
         byte[] passwd = new byte[20];
         Arrays.fill(passwd, (byte) 'a');
-        
-        client.addEntry(ledgerId, 
-            passwd,
-            entry, 
-            ByteBuffer.wrap(data), 
-            cb,
-            ctx);
+
+        client.addEntry(addr, ledgerId, passwd, entry, ChannelBuffers.wrappedBuffer(data), cb, ctx);
     }
-    
-    public void writeComplete(int rc, long ledgerId, long entryId, Object ctx){
+
+    /**
+     * Write callback: treats {@code ctx} as a Counter and increments it.
+     * {@code rc}, {@code ledgerId}, {@code entryId} and {@code addr} are not
+     * inspected, so a failed write is counted exactly like a successful one.
+     */
+    public void writeComplete(int rc, long ledgerId, long entryId, InetSocketAddress addr, Object ctx) {
         Counter counter = (Counter) ctx;
         counter.increment();
     }
-    
-    
-    public static void main(String args[]){
+
+    /**
+     * Writes a batch of identical entries to a bookie on 127.0.0.1 and prints
+     * the elapsed time once every write has been acknowledged.
+     *
+     * @param args {@code args[0]} entry size in bytes, {@code args[1]} number
+     *             of entries to write, {@code args[2]} port of the bookie
+     */
+    public static void main(String args[]) {
         byte[] data = new byte[Integer.parseInt(args[0])];
         Integer limit = Integer.parseInt(args[1]);
         Counter c = new Counter(limit);
         long ledgerId = Long.valueOf("0").longValue();
         long begin = System.currentTimeMillis();
-        
+
         LoopbackClient lb;
-        try{
-            lb = new LoopbackClient(Integer.valueOf(args[2]).intValue(), 
-                    begin, 
-                    limit.intValue());
-        
-            for(int i = 0; i < limit ; i++){
-                lb.write(ledgerId, i, data, lb, c);   
+        ClientSocketChannelFactory channelFactory = new NioClientSocketChannelFactory(Executors.newCachedThreadPool(), Executors
+                .newCachedThreadPool());
+        OrderedSafeExecutor executor = new OrderedSafeExecutor(2);
+        try {
+            InetSocketAddress addr = new InetSocketAddress("127.0.0.1", Integer.valueOf(args[2]).intValue());
+            lb = new LoopbackClient(channelFactory, executor, begin, limit.intValue());
+
+            for (int i = 0; i < limit; i++) {
+                lb.write(ledgerId, i, data, addr, lb, c);
             }
-            
-            synchronized(c){
-                c.wait();
+
+            synchronized (c) {
+                // Re-check the count under the monitor: the callbacks may have
+                // reached the limit (and issued their single notify) before
+                // this thread got here, and wait() may also wake spuriously.
+                while (c.c < c.limit) {
+                    c.wait();
+                }
                 System.out.println("Time to write all entries: " + (System.currentTimeMillis() - begin));
             }
-        } catch (IOException e){
+        } catch (IOException e) {
             e.printStackTrace();
-        } catch (InterruptedException e){
+        } catch (InterruptedException e) {
             e.printStackTrace();
         }
-    } 
-    
+    }
+
 }

+ 3 - 3
src/contrib/bookkeeper/test/org/apache/bookkeeper/test/NIOServerFactoryTest.java

@@ -1,4 +1,5 @@
 package org.apache.bookkeeper.test;
+
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -20,7 +21,6 @@ package org.apache.bookkeeper.test;
  * 
  */
 
-
 import java.net.Socket;
 import java.nio.ByteBuffer;
 
@@ -29,7 +29,6 @@ import org.apache.bookkeeper.proto.NIOServerFactory.Cnxn;
 import org.apache.bookkeeper.proto.NIOServerFactory.PacketProcessor;
 import org.junit.Test;
 
-
 import junit.framework.TestCase;
 
 public class NIOServerFactoryTest extends TestCase {
@@ -41,8 +40,9 @@ public class NIOServerFactoryTest extends TestCase {
             }
             src.sendResponse(new ByteBuffer[] { ByteBuffer.allocate(4) });
         }
-        
+
     };
+
     @Test
     public void testProblemProcessor() throws Exception {
         NIOServerFactory factory = new NIOServerFactory(22334, problemProcessor);