@@ -203,8 +203,11 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
   private long decommissionRecheckInterval;
   // default block size of a file
   private long defaultBlockSize = 0;
-  private int replIndex = 0; // last datanode used for replication work
-  static int REPL_WORK_PER_ITERATION = 32; // max percent datanodes per iteration
+
+  /**
+   * Last block index used for replication work.
+   */
+  private int replIndex = 0;
 
   public static FSNamesystem fsNamesystemObject;
   private String localMachine;
@@ -392,10 +395,10 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
                                             "heartbeat.recheck.interval", 5 * 60 * 1000); // 5 minutes
     this.heartbeatExpireInterval = 2 * heartbeatRecheckInterval +
       10 * heartbeatInterval;
-    this.replicationRecheckInterval = 3 * 1000; // 3 second
-    this.replicationRecheckInterval = conf.getInt("dfs.replication.interval", 3) * 1000;
-    this.decommissionRecheckInterval = conf.getInt("dfs.namenode.decommission.interval",
-                                                   5 * 60) * 1000;
+    this.replicationRecheckInterval =
+      conf.getInt("dfs.replication.interval", 3) * 1000L;
+    this.decommissionRecheckInterval =
+      conf.getInt("dfs.namenode.decommission.interval", 5 * 60) * 1000L;
     this.defaultBlockSize = conf.getLong("dfs.block.size", DEFAULT_BLOCK_SIZE);
     this.maxFsObjects = conf.getLong("dfs.max.objects", 0);
     this.blockInvalidateLimit = Math.max(this.blockInvalidateLimit,
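The replacement lines above widen the seconds-to-milliseconds conversion with a long literal (1000L), so the multiplication is performed in long rather than int. A minimal standalone sketch (not part of the patch) of the overflow this avoids for large configured intervals:

    public class IntervalOverflowDemo {
      public static void main(String[] args) {
        int intervalSeconds = 30 * 24 * 60 * 60;  // 30 days, still fits in an int
        long wrong = intervalSeconds * 1000;      // int multiply wraps, then widens
        long right = intervalSeconds * 1000L;     // long multiply, no overflow
        System.out.println(wrong);                // prints -1702967296
        System.out.println(right);                // prints 2592000000
      }
    }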
@@ -474,18 +477,14 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
     synchronized (neededReplications) {
       out.println("Metasave: Blocks waiting for replication: " +
                   neededReplications.size());
-      if (neededReplications.size() > 0) {
-        for (Iterator<Block> it = neededReplications.iterator();
-             it.hasNext();) {
-          Block block = it.next();
-          out.print(block);
-          for (Iterator<DatanodeDescriptor> jt = blocksMap.nodeIterator(block);
-               jt.hasNext();) {
-            DatanodeDescriptor node = jt.next();
-            out.print(" " + node + " : ");
-          }
-          out.println("");
+      for (Block block : neededReplications) {
+        out.print(block);
+        for (Iterator<DatanodeDescriptor> jt = blocksMap.nodeIterator(block);
+             jt.hasNext();) {
+          DatanodeDescriptor node = jt.next();
+          out.print(" " + node + " : ");
         }
+        out.println("");
       }
     }
 
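The rewrite above drops the redundant size() > 0 guard (iterating an empty collection is simply a no-op) and switches to the enhanced for loop, which presupposes that neededReplications implements Iterable<Block>. A minimal sketch of the equivalence, with illustrative names:

    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;

    public class ForEachDemo {
      public static void main(String[] args) {
        List<String> blocks = Arrays.asList("blk_1", "blk_2");
        // desugared form: explicit iterator, as in the removed lines
        for (Iterator<String> it = blocks.iterator(); it.hasNext();) {
          System.out.println(it.next());
        }
        // enhanced form: identical behavior, including on an empty list
        for (String b : blocks) {
          System.out.println(b);
        }
      }
    }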
@@ -2212,6 +2211,8 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
    * Periodically calls computeReplicationWork().
    */
   class ReplicationMonitor implements Runnable {
+    static final int INVALIDATE_WORK_PCT_PER_ITERATION = 32;
+    static final float REPLICATION_WORK_MULTIPLIER_PER_ITERATION = 2;
     public void run() {
       while (fsRunning) {
         try {
@@ -2219,6 +2220,8 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
           processPendingReplications();
           Thread.sleep(replicationRecheckInterval);
         } catch (InterruptedException ie) {
+          LOG.warn("ReplicationMonitor thread received InterruptedException. " + ie);
+          break;
         } catch (IOException ie) {
           LOG.warn("ReplicationMonitor thread received exception. " + ie);
         } catch (Throwable t) {
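Before this change an interrupt fell through to the broader catch clauses and the loop kept running; the new clause logs and breaks so the monitor can be shut down by interrupting its thread. A self-contained sketch of that stop-by-interrupt pattern (class and field names are illustrative, not from the patch):

    public class StoppableMonitor implements Runnable {
      public void run() {
        while (true) {
          try {
            // ... periodic work would go here ...
            Thread.sleep(3000);
          } catch (InterruptedException ie) {
            // A caller invoked thread.interrupt(); exit the loop promptly
            // instead of swallowing the exception and looping forever.
            System.out.println("monitor interrupted, shutting down");
            break;
          }
        }
      }

      public static void main(String[] args) throws InterruptedException {
        Thread t = new Thread(new StoppableMonitor());
        t.start();
        Thread.sleep(100);
        t.interrupt();  // wakes the sleep with InterruptedException
        t.join();
      }
    }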
@@ -2229,81 +2232,276 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
       }
     }
 
+  /////////////////////////////////////////////////////////
+  //
+  // These methods are called by the Namenode system, to see
+  // if there is any work for registered datanodes.
+  //
+  /////////////////////////////////////////////////////////
   /**
-   * Look at a few datanodes and compute any replication work that
-   * can be scheduled on them. The datanode will be infomed of this
-   * work at the next heartbeat.
+   * Compute block replication and block invalidation work
+   * that can be scheduled on data-nodes.
+   * The datanode will be informed of this work at the next heartbeat.
+   *
+   * @return number of blocks scheduled for replication or removal.
    */
-  void computeDatanodeWork() throws IOException {
-    int numiter = 0;
-    int foundwork = 0;
-    int hsize = 0;
-    int lastReplIndex = -1;
+  int computeDatanodeWork() throws IOException {
+    int workFound = 0;
+    int blocksToProcess = 0;
+    int nodesToProcess = 0;
+    synchronized(heartbeats) {
+      blocksToProcess = (int)(heartbeats.size()
+          * ReplicationMonitor.REPLICATION_WORK_MULTIPLIER_PER_ITERATION);
+      nodesToProcess = (int)Math.ceil((double)heartbeats.size()
+          * ReplicationMonitor.INVALIDATE_WORK_PCT_PER_ITERATION / 100);
+    }
 
-    while (true) {
-      DatanodeDescriptor node = null;
+    workFound = computeReplicationWork(blocksToProcess);
+    if(workFound == 0)
+      workFound = computeInvalidateWork(nodesToProcess);
+    return workFound;
+  }
 
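With the constants declared in ReplicationMonitor, one pass of computeDatanodeWork schedules at most twice as many block replications as there are live datanodes, and invalidation touches at most 32% of the nodes (rounded up). A worked sketch of the arithmetic on a hypothetical 100-node cluster, using the same constants:

    public class ThrottleMath {
      static final int INVALIDATE_WORK_PCT_PER_ITERATION = 32;
      static final float REPLICATION_WORK_MULTIPLIER_PER_ITERATION = 2;

      public static void main(String[] args) {
        int liveNodes = 100; // hypothetical heartbeats.size()
        int blocksToProcess =
            (int) (liveNodes * REPLICATION_WORK_MULTIPLIER_PER_ITERATION);
        int nodesToProcess = (int) Math.ceil(
            (double) liveNodes * INVALIDATE_WORK_PCT_PER_ITERATION / 100);
        System.out.println(blocksToProcess); // 200 replications max per pass
        System.out.println(nodesToProcess);  // 32 nodes invalidated max per pass
      }
    }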
-      //
-      // pick the datanode that was the last one in the
-      // previous invocation of this method.
-      //
-      synchronized (heartbeats) {
-        hsize = heartbeats.size();
-        if (numiter++ >= hsize) {
-          // no change in replIndex.
-          if (lastReplIndex >= 0) {
-            //next time, start after where the last replication was scheduled
-            replIndex = lastReplIndex;
-          }
-          break;
-        }
-        if (replIndex >= hsize) {
+  private int computeInvalidateWork(int nodesToProcess) {
+    int blockCnt = 0;
+    for(int nodeCnt = 0; nodeCnt < nodesToProcess; nodeCnt++ ) {
+      int work = invalidateWorkForOneNode();
+      if(work == 0)
+        break;
+      blockCnt += work;
+    }
+    return blockCnt;
+  }
+
+  /**
+   * Scan blocks in {@link #neededReplications} and assign replication
+   * work to data-nodes they belong to.
+   *
+   * The number of blocks to process equals either twice the number of live
+   * data-nodes or the number of under-replicated blocks, whichever is less.
+   *
+   * @return number of blocks scheduled for replication during this iteration.
+   */
+  private synchronized int computeReplicationWork(
+                                  int blocksToProcess) throws IOException {
+    int scheduledReplicationCount = 0;
+    // blocks should not be replicated or removed if safe mode is on
+    if (isInSafeMode())
+      return scheduledReplicationCount;
+
+    synchronized(neededReplications) {
+      // # of blocks to process equals either twice the number of live
+      // data-nodes or the number of under-replicated blocks whichever is less
+      blocksToProcess = Math.min(blocksToProcess, neededReplications.size());
+      if(blocksToProcess == 0)
+        return scheduledReplicationCount;
+
+      // Go through all blocks that need replications.
+      // Select source and target nodes for replication.
+      Iterator<Block> neededReplicationsIterator = neededReplications.iterator();
+      // skip to the first unprocessed block, which is at replIndex
+      for(int i=0; i < replIndex && neededReplicationsIterator.hasNext(); i++) {
+        neededReplicationsIterator.next();
+      }
+      // process blocks
+      for(int blkCnt = 0; blkCnt < blocksToProcess; blkCnt++, replIndex++) {
+        if( ! neededReplicationsIterator.hasNext()) {
+          // start from the beginning
           replIndex = 0;
+          blocksToProcess = Math.min(blocksToProcess, neededReplications.size());
+          if(blkCnt >= blocksToProcess)
+            break;
+          neededReplicationsIterator = neededReplications.iterator();
+          assert neededReplicationsIterator.hasNext() :
+            "neededReplications should not be empty.";
         }
-        node = heartbeats.get(replIndex);
-        replIndex++;
-      }
 
-      //
-      // Is there replication work to be computed for this datanode?
-      //
-      int precomputed = node.getNumberOfBlocksToBeReplicated();
-      int needed = this.maxReplicationStreams - precomputed;
-      boolean doReplication = false;
-      boolean doInvalidation = false;
-      if (needed > 0) {
-        //
-        // Compute replication work and store work into the datanode
-        //
-        Object replsets[] = pendingTransfers(node, needed);
-        if (replsets != null) {
-          doReplication = true;
-          addBlocksToBeReplicated(node, (Block[])replsets[0],
-                                  (DatanodeDescriptor[][])replsets[1]);
-          lastReplIndex = replIndex;
+        Block block = neededReplicationsIterator.next();
+
+        // block should belong to a file
+        INodeFile fileINode = blocksMap.getINode(block);
+        if(fileINode == null) { // abandoned block
+          neededReplicationsIterator.remove(); // remove from neededReplications
+          replIndex--;
+          continue;
         }
-      }
-      if (!doReplication) {
-        //
-        // Determine if block deletion is pending for this datanode
-        //
-        Block blocklist[] = blocksToInvalidate(node);
-        if (blocklist != null) {
-          doInvalidation = true;
-          addBlocksToBeInvalidated(node, blocklist);
+        int requiredReplication = fileINode.getReplication();
+
+        // get a source data-node
+        List<DatanodeDescriptor> containingNodes =
+          new ArrayList<DatanodeDescriptor>();
+        NumberReplicas numReplicas = new NumberReplicas();
+        DatanodeDescriptor srcNode =
+          chooseSourceDatanode(block, containingNodes, numReplicas);
+        if(srcNode == null) // block can not be replicated from any node
+          continue;
+
+        // do not schedule more if enough replicas are already pending
+        int numEffectiveReplicas = numReplicas.liveReplicas() +
+                                pendingReplications.getNumReplicas(block);
+        if(numEffectiveReplicas >= requiredReplication) {
+          neededReplicationsIterator.remove(); // remove from neededReplications
+          replIndex--;
+          NameNode.stateChangeLog.info("BLOCK* "
+              + "Removing block " + block.getBlockName()
+              + " from neededReplications as it has enough replicas.");
+          continue;
         }
-      }
-      if (doReplication || doInvalidation) {
-        //
-        // If we have already computed work for a predefined
-        // number of datanodes in this iteration, then relax
-        //
-        if (foundwork > ((hsize * REPL_WORK_PER_ITERATION)/100)) {
-          break;
+
+        // choose replication targets
+        int maxTargets =
+          maxReplicationStreams - srcNode.getNumberOfBlocksToBeReplicated();
+        assert maxTargets > 0 : "Datanode " + srcNode.getName()
+              + " should not have been selected as a source for replication.";
+        DatanodeDescriptor targets[] = replicator.chooseTarget(
+            Math.min(requiredReplication - numEffectiveReplicas, maxTargets),
+            srcNode, containingNodes, null, block.getNumBytes());
+        if(targets.length == 0)
+          continue;
+        // Add block to the list of blocks to be replicated
+        srcNode.addBlockToBeReplicated(block, targets);
+        scheduledReplicationCount++;
+
+        // Move the block-replication into a "pending" state.
+        // The reason we use 'pending' is so we can retry
+        // replications that fail after an appropriate amount of time.
+        if(numEffectiveReplicas + targets.length >= requiredReplication) {
+          neededReplicationsIterator.remove(); // remove from neededReplications
+          replIndex--;
+          pendingReplications.add(block, targets.length);
+          NameNode.stateChangeLog.debug(
+              "BLOCK* block " + block.getBlockName()
+              + " is moved from neededReplications to pendingReplications");
         }
-        foundwork++;
-      }
+        if (NameNode.stateChangeLog.isInfoEnabled()) {
+          StringBuffer targetList = new StringBuffer("datanode(s)");
+          for (int k = 0; k < targets.length; k++) {
+            targetList.append(' ');
+            targetList.append(targets[k].getName());
+          }
+          NameNode.stateChangeLog.info(
+              "BLOCK* ask "
+              + srcNode.getName() + " to replicate "
+              + block.getBlockName() + " to " + targetList);
+          NameNode.stateChangeLog.debug(
+              "BLOCK* neededReplications = " + neededReplications.size()
+              + " pendingReplications = " + pendingReplications.size());
+        }
+      }
+    }
+    return scheduledReplicationCount;
+  }
+
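replIndex survives across calls, so each invocation resumes scanning neededReplications where the previous one stopped, wrapping to the front when the iterator runs out; the decrements after each remove() keep the index pointing at the next unprocessed block. A simplified sketch of this resumable round-robin scan (illustrative names, a plain List standing in for the under-replicated block queue):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public class RoundRobinScan {
      private int index = 0; // plays the role of replIndex

      int scan(List<String> items, int quota) {
        int processed = 0;
        quota = Math.min(quota, items.size());
        for (int n = 0; n < quota; n++, index++) {
          if (index >= items.size())
            index = 0; // wrap to the beginning, as the patch does
          System.out.println("processing " + items.get(index));
          processed++;
        }
        return processed;
      }

      public static void main(String[] args) {
        RoundRobinScan s = new RoundRobinScan();
        List<String> blocks = new ArrayList<String>(
            Arrays.asList("blk_1", "blk_2", "blk_3"));
        s.scan(blocks, 2); // blk_1, blk_2
        s.scan(blocks, 2); // blk_3, then wraps around to blk_1
      }
    }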
+  /**
+   * Parse the data-nodes the block belongs to and choose one,
+   * which will be the replication source.
+   *
+   * We prefer nodes that are in DECOMMISSION_INPROGRESS state to other nodes
+   * since the former do not have write traffic and hence are less busy.
+   * We do not use already decommissioned nodes as a source.
+   * Otherwise we choose a random node among those that did not reach their
+   * replication limit.
+   *
+   * In addition, form a list of all nodes containing the block
+   * and calculate its replication numbers.
+   */
+  private DatanodeDescriptor chooseSourceDatanode(
+                                    Block block,
+                                    List<DatanodeDescriptor> containingNodes,
+                                    NumberReplicas numReplicas) {
+    containingNodes.clear();
+    DatanodeDescriptor srcNode = null;
+    int live = 0;
+    int decommissioned = 0;
+    Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block);
+    while(it.hasNext()) {
+      DatanodeDescriptor node = it.next();
+      if(!node.isDecommissionInProgress() && !node.isDecommissioned())
+        live++;
+      else
+        decommissioned++;
+      containingNodes.add(node);
+      if(node.getNumberOfBlocksToBeReplicated() >= maxReplicationStreams)
+        continue; // already reached replication limit
+      // the block must not be scheduled for removal on srcNode
+      Collection<Block> excessBlocks =
+        excessReplicateMap.get(node.getStorageID());
+      if(excessBlocks != null && excessBlocks.contains(block))
+        continue;
+      // never use already decommissioned nodes
+      if(node.isDecommissioned())
+        continue;
+      // we prefer nodes that are in DECOMMISSION_INPROGRESS state
+      if(node.isDecommissionInProgress() || srcNode == null) {
+        srcNode = node;
+        continue;
+      }
+      if(srcNode.isDecommissionInProgress())
+        continue;
+      // switch to a different node randomly
+      // this is to prevent deterministically selecting the same node even
+      // if the node failed to replicate the block on previous iterations
+      if(r.nextBoolean())
+        srcNode = node;
+    }
+    if(numReplicas != null)
+      numReplicas.initialize(live, decommissioned);
+    return srcNode;
+  }
+
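The coin flip at the end of the loop replaces the current candidate with probability 1/2 at every eligible node, so repeated passes do not keep hitting a source that failed before. Note this is not uniform selection: later nodes in iteration order are favored. An illustrative comparison with reservoir sampling, which would be uniform (this sketch is not from the patch):

    import java.util.Random;

    public class SourcePick {
      public static void main(String[] args) {
        Random r = new Random();
        String[] nodes = {"dn1", "dn2", "dn3", "dn4"};

        // Coin flip, as in chooseSourceDatanode: later nodes are favored
        // (the last node wins with probability 1/2).
        String coinFlipPick = null;
        for (String n : nodes) {
          if (coinFlipPick == null || r.nextBoolean())
            coinFlipPick = n;
        }

        // Reservoir sampling: each node ends up chosen with probability 1/4.
        String uniformPick = null;
        int seen = 0;
        for (String n : nodes) {
          seen++;
          if (r.nextInt(seen) == 0)
            uniformPick = n;
        }
        System.out.println(coinFlipPick + " / " + uniformPick);
      }
    }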
+  /**
+   * Get blocks to invalidate for the first node
+   * in {@link #recentInvalidateSets}.
+   *
+   * @return number of blocks scheduled for removal during this iteration.
+   */
+  private synchronized int invalidateWorkForOneNode() {
+    // blocks should not be replicated or removed if safe mode is on
+    if (isInSafeMode())
+      return 0;
+    if(recentInvalidateSets.isEmpty())
+      return 0;
+    // get blocks to invalidate for the first node
+    String firstNodeId = recentInvalidateSets.keySet().iterator().next();
+    assert firstNodeId != null;
+    DatanodeDescriptor dn = datanodeMap.get(firstNodeId);
+    Collection<Block> invalidateSet = recentInvalidateSets.remove(firstNodeId);
+
+    if(invalidateSet == null || dn == null)
+      return 0;
+
+    ArrayList<Block> blocksToInvalidate =
+      new ArrayList<Block>(blockInvalidateLimit);
+
+    // # blocks that can be sent in one message is limited
+    Iterator<Block> it = invalidateSet.iterator();
+    for(int blkCount = 0; blkCount < blockInvalidateLimit && it.hasNext();
+        blkCount++) {
+      blocksToInvalidate.add(it.next());
+      it.remove();
     }
+
+    // If we could not send everything in this message, reinsert this item
+    // into the collection.
+    if(it.hasNext())
+      recentInvalidateSets.put(firstNodeId, invalidateSet);
+
+    dn.addBlocksToBeInvalidated(blocksToInvalidate);
+
+    if(NameNode.stateChangeLog.isInfoEnabled()) {
+      StringBuffer blockList = new StringBuffer();
+      for(Block blk : blocksToInvalidate) {
+        blockList.append(' ');
+        blockList.append(blk.getBlockName());
+      }
+      NameNode.stateChangeLog.info("BLOCK* ask "
+          + dn.getName() + " to delete " + blockList);
+    }
+    return blocksToInvalidate.size();
+  }
+
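The method drains at most blockInvalidateLimit blocks per call and reinserts the remainder, so one oversized invalidate set cannot monopolize a heartbeat reply. A generic sketch of this drain-and-requeue chunking, with hypothetical names:

    import java.util.ArrayDeque;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Queue;

    public class ChunkedDrain {
      static List<Integer> drain(Queue<Integer> pending, int limit) {
        List<Integer> batch = new ArrayList<Integer>(limit);
        while (batch.size() < limit && !pending.isEmpty()) {
          batch.add(pending.poll());
        }
        return batch; // anything still in 'pending' waits for the next call
      }

      public static void main(String[] args) {
        Queue<Integer> pending = new ArrayDeque<Integer>();
        for (int i = 0; i < 7; i++) pending.add(i);
        System.out.println(drain(pending, 3)); // [0, 1, 2]
        System.out.println(drain(pending, 3)); // [3, 4, 5]
        System.out.println(drain(pending, 3)); // [6]
      }
    }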
+  void setNodeReplicationLimit(int limit) {
+    this.maxReplicationStreams = limit;
   }
 
   /**
@@ -2325,36 +2523,6 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
     }
   }
 
-  /**
-   * Add more replication work for this datanode.
-   */
-  synchronized void addBlocksToBeReplicated(DatanodeDescriptor node,
-                                            Block[] blocklist,
-                                            DatanodeDescriptor[][] targets)
-    throws IOException {
-    //
-    // Find the datanode with the FSNamesystem lock held.
-    //
-    DatanodeDescriptor n = getDatanode(node);
-    if (n != null) {
-      n.addBlocksToBeReplicated(blocklist, targets);
-    }
-  }
-
-  /**
-   * Add more block invalidation work for this datanode.
-   */
-  synchronized void addBlocksToBeInvalidated(DatanodeDescriptor node,
-                                             Block[] blocklist) throws IOException {
-    //
-    // Find the datanode with the FSNamesystem lock held.
-    //
-    DatanodeDescriptor n = getDatanode(node);
-    if (n != null) {
-      n.addBlocksToBeInvalidated(blocklist);
-    }
-  }
-
   /**
    * remove a datanode descriptor
    * @param nodeID datanode ID
@@ -3125,78 +3293,23 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
   short getMinReplication()     { return (short)minReplication; }
   short getDefaultReplication() { return (short)defaultReplication; }
 
-  /////////////////////////////////////////////////////////
-  //
-  // These methods are called by the Namenode system, to see
-  // if there is any work for a given datanode.
-  //
-  /////////////////////////////////////////////////////////
-
-  /**
-   * Check if there are any recently-deleted blocks a datanode should remove.
-   */
-  public synchronized Block[] blocksToInvalidate(DatanodeID nodeID) {
-    // Ask datanodes to perform block delete
-    // only if safe mode is off.
-    if (isInSafeMode())
-      return null;
-
-    Collection<Block> invalidateSet = recentInvalidateSets.remove(
-                                                                  nodeID.getStorageID());
-
-    if (invalidateSet == null) {
-      return null;
-    }
-
-    Iterator<Block> it = null;
-    int sendNum = invalidateSet.size();
-    ArrayList<Block> sendBlock = new ArrayList<Block>(sendNum);
-
-    //
-    // calculate the number of blocks that we send in one message
-    //
-    sendNum = Math.min(sendNum, blockInvalidateLimit);
-
-    //
-    // Copy the first chunk into sendBlock
-    //
-    for (it = invalidateSet.iterator(); sendNum > 0; sendNum--) {
-      assert(it.hasNext());
-      sendBlock.add(it.next());
-      it.remove();
-    }
-
-    //
-    // If we could not send everything in this message, reinsert this item
-    // into the collection.
-    //
-    if (it.hasNext()) {
-      recentInvalidateSets.put(nodeID.getStorageID(), invalidateSet);
-    }
-
-    if (NameNode.stateChangeLog.isInfoEnabled()) {
-      StringBuffer blockList = new StringBuffer();
-      for (int i = 0; i < sendBlock.size(); i++) {
-        blockList.append(' ');
-        Block block = sendBlock.get(i);
-        blockList.append(block.getBlockName());
-      }
-      NameNode.stateChangeLog.info("BLOCK* NameSystem.blockToInvalidate: "
-                                   +"ask "+nodeID.getName()+" to delete " + blockList);
-    }
-    return sendBlock.toArray(new Block[sendBlock.size()]);
-  }
-
-
   /**
    * A immutable object that stores the number of live replicas and
    * the number of decommissined Replicas.
    */
-  static class NumberReplicas {
+  private static class NumberReplicas {
     private int liveReplicas;
     private int decommissionedReplicas;
 
+    NumberReplicas() {
+      initialize(0, 0);
+    }
+
     NumberReplicas(int live, int decommissioned) {
+      initialize(live, decommissioned);
+    }
+
+    void initialize(int live, int decommissioned) {
       liveReplicas = live;
       decommissionedReplicas = decommissioned;
     }
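The no-argument constructor plus initialize() lets chooseSourceDatanode fill in a caller-supplied NumberReplicas rather than allocating and returning a second object, replacing the NumberReplicas[] out-array used by the removed containingNodeList(). A minimal sketch of that mutable-holder pattern (illustrative types, not from the patch):

    public class OutParamDemo {
      static class Counts {
        int live, decommissioned;
        void initialize(int live, int decommissioned) {
          this.live = live;
          this.decommissioned = decommissioned;
        }
      }

      static String choose(String[] nodes, Counts out) {
        out.initialize(nodes.length, 0); // report counts through the holder
        return nodes.length > 0 ? nodes[0] : null;
      }

      public static void main(String[] args) {
        Counts counts = new Counts();
        String src = choose(new String[] {"dn1", "dn2"}, counts);
        System.out.println(src + " live=" + counts.live);
      }
    }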
@@ -3235,32 +3348,6 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
     return countNodes(blocksMap.nodeIterator(b));
   }
 
-  /**
-   * Returns a newly allocated list of all nodes. Returns a count of
-   * live and decommissioned nodes.
-   */
-  ArrayList<DatanodeDescriptor> containingNodeList(Block b, NumberReplicas[] numReplicas) {
-    ArrayList<DatanodeDescriptor> nodeList =
-      new ArrayList<DatanodeDescriptor>();
-    int count = 0;
-    int live = 0;
-    for(Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(b);
-        it.hasNext();) {
-      DatanodeDescriptor node = it.next();
-      if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
-        live++;
-      }
-      else {
-        count++;
-      }
-      nodeList.add(node);
-    }
-    if (numReplicas != null) {
-      numReplicas[0] = new NumberReplicas(live, count);
-    }
-    return nodeList;
-  }
-
   /**
    * Return true if there are any blocks on this node that have not
    * yet reached their replication factor. Otherwise returns false.
@@ -3318,140 +3405,6 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
     return false;
   }
 
-  /**
-   * Return with a list of Block/DataNodeInfo sets, indicating
-   * where various Blocks should be copied, ASAP.
-   *
-   * The Array that we return consists of two objects:
-   * The 1st elt is an array of Blocks.
-   * The 2nd elt is a 2D array of DatanodeDescriptor objs, identifying the
-   * target sequence for the Block at the appropriate index.
-   *
-   */
-  public synchronized Object[] pendingTransfers(DatanodeID srcNode,
-                                                int needed) {
-    // Ask datanodes to perform block replication
-    // only if safe mode is off.
-    if (isInSafeMode())
-      return null;
-
-    synchronized (neededReplications) {
-      Object results[] = null;
-
-      if (neededReplications.size() > 0) {
-        //
-        // Go through all blocks that need replications. See if any
-        // are present at the current node. If so, ask the node to
-        // replicate them.
-        //
-        List<Block> replicateBlocks = new ArrayList<Block>();
-        List<NumberReplicas> numCurrentReplicas = new ArrayList<NumberReplicas>();
-        List<DatanodeDescriptor[]> replicateTargetSets;
-        replicateTargetSets = new ArrayList<DatanodeDescriptor[]>();
-        NumberReplicas[] allReplicas = new NumberReplicas[1];
-        for (Iterator<Block> it = neededReplications.iterator(); it.hasNext();) {
-          if (needed <= 0) {
-            break;
-          }
-          Block block = it.next();
-          long blockSize = block.getNumBytes();
-          INodeFile fileINode = blocksMap.getINode(block);
-          if (fileINode == null) { // block does not belong to any file
-            it.remove();
-          } else {
-            List<DatanodeDescriptor> containingNodes =
-              containingNodeList(block, allReplicas);
-            Collection<Block> excessBlocks = excessReplicateMap.get(
-                                                                    srcNode.getStorageID());
-
-            // srcNode must contain the block, and the block must
-            // not be scheduled for removal on that node
-            if (containingNodes.contains(srcNode)
-                && (excessBlocks == null || !excessBlocks.contains(block))) {
-              int numCurrentReplica = allReplicas[0].liveReplicas() +
-                pendingReplications.getNumReplicas(block);
-              NumberReplicas repl = new NumberReplicas(numCurrentReplica,
-                                                       allReplicas[0].decommissionedReplicas());
-              if (numCurrentReplica >= fileINode.getReplication()) {
-                it.remove();
-              } else {
-                DatanodeDescriptor targets[] = replicator.chooseTarget(
-                    Math.min(fileINode.getReplication() - numCurrentReplica,
-                             needed),
-                    datanodeMap.get(srcNode.getStorageID()),
-                    containingNodes, null, blockSize);
-                if (targets.length > 0) {
-                  // Build items to return
-                  replicateBlocks.add(block);
-                  numCurrentReplicas.add(repl);
-                  replicateTargetSets.add(targets);
-                  needed -= targets.length;
-                }
-              }
-            }
-          }
-        }
-
-        //
-        // Move the block-replication into a "pending" state.
-        // The reason we use 'pending' is so we can retry
-        // replications that fail after an appropriate amount of time.
-        // (REMIND - mjc - this timer is not yet implemented.)
-        //
-        if (replicateBlocks.size() > 0) {
-          int i = 0;
-          for (Iterator<Block> it = replicateBlocks.iterator(); it.hasNext(); i++) {
-            Block block = it.next();
-            DatanodeDescriptor targets[] = replicateTargetSets.get(i);
-            int numCurrentReplica = numCurrentReplicas.get(i).liveReplicas();
-            int numExpectedReplica = blocksMap.getINode(block).getReplication();
-            if (numCurrentReplica + targets.length >= numExpectedReplica) {
-              neededReplications.remove(
-                                        block,
-                                        numCurrentReplica,
-                                        numCurrentReplicas.get(i).decommissionedReplicas(),
-                                        numExpectedReplica);
-              pendingReplications.add(block, targets.length);
-              NameNode.stateChangeLog.debug(
-                  "BLOCK* NameSystem.pendingTransfer: "
-                  + block.getBlockName()
-                  + " is removed from neededReplications to pendingReplications");
-            }
-
-            if (NameNode.stateChangeLog.isInfoEnabled()) {
-              StringBuffer targetList = new StringBuffer("datanode(s)");
-              for (int k = 0; k < targets.length; k++) {
-                targetList.append(' ');
-                targetList.append(targets[k].getName());
-              }
-              NameNode.stateChangeLog.info(
-                  "BLOCK* NameSystem.pendingTransfer: " + "ask "
-                  + srcNode.getName() + " to replicate "
-                  + block.getBlockName() + " to " + targetList);
-              NameNode.stateChangeLog.debug(
-                  "BLOCK* neededReplications = " + neededReplications.size()
-                  + " pendingReplications = " + pendingReplications.size());
-            }
-          }
-
-          //
-          // Build returned objects from above lists
-          //
-          DatanodeDescriptor targetMatrix[][] =
-            new DatanodeDescriptor[replicateTargetSets.size()][];
-          for (i = 0; i < targetMatrix.length; i++) {
-            targetMatrix[i] = replicateTargetSets.get(i);
-          }
-
-          results = new Object[2];
-          results[0] = replicateBlocks.toArray(new Block[replicateBlocks.size()]);
-          results[1] = targetMatrix;
-        }
-      }
-      return results;
-    }
-  }
-
   /**
    * Keeps track of which datanodes are allowed to connect to the namenode.
    */