
Merge -r 808671:809439 from trunk to bring its changes to the append branch.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hdfs/branches/HDFS-265@810323 13f79535-47bb-0310-9956-ffa450edef68
Hairong Kuang, 15 years ago
parent commit: e7f903ea4d

+ 11 - 2
CHANGES.txt

@@ -29,6 +29,9 @@ Trunk (unreleased changes)
     HDFS-565. Introduce block committing logic during new block allocation
     and file close. (shv)
 
+    HDFS-492. Add two JSON JSP pages to the Namenode for providing corrupt
+    blocks/replicas information.  (Bill Zeller via szetszwo)
+
   IMPROVEMENTS
 
     HDFS-381. Remove blocks from DataNode maps when corresponding file
@@ -119,6 +122,7 @@ Trunk (unreleased changes)
     HDFS-552. Change TestFiDataTransferProtocol to junit 4 and add a few new
     tests.  (szetszwo)
 
+<<<<<<< .working
     HDFS-562. Add a test for NameNode.getBlockLocations(..) to check read from
     un-closed file.  (szetszwo)
 
@@ -129,6 +133,8 @@ Trunk (unreleased changes)
     to be executed by the run-test-hdfs-fault-inject target.  (Konstantin
     Boudnik via szetszwo)
 
+=======
+>>>>>>> .merge-right.r809439
     HDFS-563. Simplify the codes in FSNamesystem.getBlockLocations(..).
     (szetszwo)
 
@@ -194,8 +200,11 @@ Trunk (unreleased changes)
     HDFS-553. BlockSender reports wrong failed position in ChecksumException.
     (hairong)
 
-    HDFS-568. Update hadoop-mapred-examples-0.21.0-dev.jar for MAPREDUCE-874.
-    (szetszwo)
+    HDFS-568. Set mapred.job.tracker.retire.jobs to false in
+    src/test/mapred-site.xml for mapreduce tests to run.  (Amareshwari
+    Sriramadasu via szetszwo)
+ 
+    HDFS-15. All replicas end up on 1 rack. (Jitendra Nath Pandey via hairong)
  
 Release 0.20.1 - Unreleased
 

+ 0 - 2
build.xml

@@ -636,7 +636,6 @@
         </batchtest>
         <batchtest todir="${test.build.dir}" if="tests.testcase.fi">
           <fileset dir="${test.src.dir}/aop" includes="**/${testcase}.java"/>
-          <fileset dir="${test.src.dir}/hdfs" includes="**/${testcase}.java"/>
         </batchtest>
       </junit>
       <antcall target="checkfailure"/>
@@ -696,7 +695,6 @@
       </batchtest>
       <batchtest todir="${test.build.dir}" if="tests.testcase.fi">
         <fileset dir="${test.src.dir}/aop" includes="**/${testcase}.java"/>
-        <fileset dir="${test.src.dir}/hdfs-with-mr" includes="**/${testcase}.java"/>
       </batchtest>
     </junit>
     <antcall target="checkfailure"/>

+ 11 - 3
src/java/org/apache/hadoop/hdfs/server/datanode/DatanodeJspHelper.java

@@ -258,6 +258,10 @@ class DatanodeJspHelper {
     out.print("<B>Total number of blocks: " + blocks.size() + "</B><br>");
     // generate a table and dump the info
     out.println("\n<table>");
+    
+    String namenodeHost = datanode.getNameNodeAddr().getHostName();
+    String namenodeHostName = InetAddress.getByName(namenodeHost).getCanonicalHostName();
+    
     for (LocatedBlock cur : blocks) {
       out.print("<tr>");
       final String blockidstring = Long.toString(cur.getBlock().getBlockId());
@@ -277,14 +281,18 @@ class DatanodeJspHelper {
             + "&genstamp=" + cur.getBlock().getGenerationStamp()
             + "&namenodeInfoPort=" + namenodeInfoPort
             + "&chunkSizeToView=" + chunkSizeToView;
+
+        String blockInfoUrl = "http://" + namenodeHostName + ":"
+            + namenodeInfoPort
+            + "/block_info_xml.jsp?blockId=" + blockidstring;
         out.print("<td>&nbsp</td><td><a href=\"" + blockUrl + "\">"
-            + datanodeAddr + "</a></td>");
+            + datanodeAddr + "</a></td><td>"
+            + "<a href=\"" + blockInfoUrl + "\">View Block Info</a></td>");
       }
       out.println("</tr>");
     }
     out.println("</table>");
     out.print("<hr>");
-    String namenodeHost = datanode.getNameNodeAddr().getHostName();
     out.print("<br><a href=\"http://"
         + InetAddress.getByName(namenodeHost).getCanonicalHostName() + ":"
         + namenodeInfoPort + "/dfshealth.jsp\">Go back to DFS home</a>");
@@ -577,4 +585,4 @@ class DatanodeJspHelper {
     out.print("</textarea>");
     dfs.close();
   }
-}
+}

+ 159 - 30
src/java/org/apache/hadoop/hdfs/server/namenode/BlockManager.java

@@ -105,6 +105,9 @@ public class BlockManager {
   // Default number of replicas
   int defaultReplication;
 
+  // variable to enable check for enough racks 
+  boolean shouldCheckForEnoughRacks = true;
+
   /**
    * Last block index used for replication work.
    */
@@ -155,10 +158,13 @@ public class BlockManager {
                             + " must be less than dfs.replication.max = "
                             + maxReplication);
     this.maxReplicationStreams = conf.getInt("dfs.max-repl-streams", 2);
+    this.shouldCheckForEnoughRacks = conf.get("topology.script.file.name") == null ? false
+                                                                             : true;
     FSNamesystem.LOG.info("defaultReplication = " + defaultReplication);
     FSNamesystem.LOG.info("maxReplication = " + maxReplication);
     FSNamesystem.LOG.info("minReplication = " + minReplication);
     FSNamesystem.LOG.info("maxReplicationStreams = " + maxReplicationStreams);
+    FSNamesystem.LOG.info("shouldCheckForEnoughRacks = " + shouldCheckForEnoughRacks);
   }
 
   void activate() {
@@ -677,6 +683,7 @@ public class BlockManager {
     int requiredReplication, numEffectiveReplicas;
     List<DatanodeDescriptor> containingNodes;
     DatanodeDescriptor srcNode;
+    int additionalReplRequired;
 
     synchronized (namesystem) {
       synchronized (neededReplications) {
@@ -688,6 +695,7 @@ public class BlockManager {
           replIndex--;
           return false;
         }
+
         requiredReplication = fileINode.getReplication();
 
         // get a source data-node
@@ -704,21 +712,32 @@ public class BlockManager {
         // do not schedule more if enough replicas is already pending
         numEffectiveReplicas = numReplicas.liveReplicas() +
                                 pendingReplications.getNumReplicas(block);
-        if(numEffectiveReplicas >= requiredReplication) {
-          neededReplications.remove(block, priority); // remove from neededReplications
-          replIndex--;
-          NameNode.stateChangeLog.info("BLOCK* "
-              + "Removing block " + block
-              + " from neededReplications as it has enough replicas.");
-          return false;
+      
+        if (numEffectiveReplicas >= requiredReplication) {
+          if ( (pendingReplications.getNumReplicas(block) > 0) ||
+               (blockHasEnoughRacks(block)) ) {
+            neededReplications.remove(block, priority); // remove from neededReplications
+            replIndex--;
+            NameNode.stateChangeLog.info("BLOCK* "
+                + "Removing block " + block
+                + " from neededReplications as it has enough replicas.");
+            return false;
+          }
+        }
+
+        if (numReplicas.liveReplicas() < requiredReplication) {
+          additionalReplRequired = requiredReplication - numEffectiveReplicas;
+        } else {
+          additionalReplRequired = 1; //Needed on a new rack
         }
+
       }
     }
 
     // choose replication targets: NOT HOLDING THE GLOBAL LOCK
-    DatanodeDescriptor targets[] = replicator.chooseTarget(
-        requiredReplication - numEffectiveReplicas,
-        srcNode, containingNodes, null, block.getNumBytes());
+    DatanodeDescriptor targets[] = 
+                       replicator.chooseTarget(additionalReplRequired,
+                       srcNode, containingNodes, null, block.getNumBytes());
     if(targets.length == 0)
       return false;
 
@@ -739,13 +758,25 @@ public class BlockManager {
         NumberReplicas numReplicas = countNodes(block);
         numEffectiveReplicas = numReplicas.liveReplicas() +
         pendingReplications.getNumReplicas(block);
-        if(numEffectiveReplicas >= requiredReplication) {
-          neededReplications.remove(block, priority); // remove from neededReplications
-          replIndex--;
-          NameNode.stateChangeLog.info("BLOCK* "
-              + "Removing block " + block
-              + " from neededReplications as it has enough replicas.");
-          return false;
+
+        if (numEffectiveReplicas >= requiredReplication) {
+          if ( (pendingReplications.getNumReplicas(block) > 0) ||
+               (blockHasEnoughRacks(block)) ) {
+            neededReplications.remove(block, priority); // remove from neededReplications
+            replIndex--;
+            NameNode.stateChangeLog.info("BLOCK* "
+                + "Removing block " + block
+                + " from neededReplications as it has enough replicas.");
+            return false;
+          }
+        }
+
+        if ( (numReplicas.liveReplicas() >= requiredReplication) &&
+             (!blockHasEnoughRacks(block)) ) {
+          if (srcNode.getNetworkLocation().equals(targets[0].getNetworkLocation())) {
+            //No use continuing, unless a new rack in this case
+            return false;
+          }
         }
 
         // Add block to the to be replicated list
@@ -867,10 +898,13 @@ public class BlockManager {
       synchronized (namesystem) {
         for (int i = 0; i < timedOutItems.length; i++) {
           NumberReplicas num = countNodes(timedOutItems[i]);
-          neededReplications.add(timedOutItems[i],
-                                 num.liveReplicas(),
-                                 num.decommissionedReplicas(),
-                                 getReplication(timedOutItems[i]));
+          if (isNeededReplication(timedOutItems[i], getReplication(timedOutItems[i]),
+                                 num.liveReplicas())) {
+            neededReplications.add(timedOutItems[i],
+                                   num.liveReplicas(),
+                                   num.decommissionedReplicas(),
+                                   getReplication(timedOutItems[i]));
+          }
         }
       }
       /* If we know the target datanodes where the replication timedout,
@@ -1122,9 +1156,11 @@ public class BlockManager {
         NumberReplicas num = countNodes(block);
         int numCurrentReplica = num.liveReplicas();
         // add to under-replicated queue if need to be
-        if (neededReplications.add(block, numCurrentReplica, num
-            .decommissionedReplicas(), expectedReplication)) {
-          nrUnderReplicated++;
+        if (isNeededReplication(block, expectedReplication, numCurrentReplica)) {
+          if (neededReplications.add(block, numCurrentReplica, num
+              .decommissionedReplicas(), expectedReplication)) {
+            nrUnderReplicated++;
+          }
         }
 
         if (numCurrentReplica > expectedReplication) {
@@ -1303,8 +1339,11 @@ public class BlockManager {
         NumberReplicas num = countNodes(block);
         int curReplicas = num.liveReplicas();
         int curExpectedReplicas = getReplication(block);
-        if (curExpectedReplicas > curReplicas) {
-          status = true;
+        if (isNeededReplication(block, curExpectedReplicas, curReplicas)) {
+          if (curExpectedReplicas > curReplicas) {
+            //Set to true only if strictly under-replicated
+            status = true;
+          }
           if (!neededReplications.contains(block) &&
             pendingReplications.getNumReplicas(block) == 0) {
             //
@@ -1357,16 +1396,23 @@ public class BlockManager {
     synchronized (namesystem) {
       NumberReplicas repl = countNodes(block);
       int curExpectedReplicas = getReplication(block);
-      neededReplications.update(block, repl.liveReplicas(), repl
-          .decommissionedReplicas(), curExpectedReplicas, curReplicasDelta,
-          expectedReplicasDelta);
+      if (isNeededReplication(block, curExpectedReplicas, repl.liveReplicas())) {
+        neededReplications.update(block, repl.liveReplicas(), repl
+            .decommissionedReplicas(), curExpectedReplicas, curReplicasDelta,
+            expectedReplicasDelta);
+      } else {
+        int oldReplicas = repl.liveReplicas()-curReplicasDelta;
+        int oldExpectedReplicas = curExpectedReplicas-expectedReplicasDelta;
+        neededReplications.remove(block, oldReplicas, repl.decommissionedReplicas(),
+                                  oldExpectedReplicas);
+      }
     }
   }
 
   void checkReplication(Block block, int numExpectedReplicas) {
     // filter out containingNodes that are marked for decommission.
     NumberReplicas number = countNodes(block);
-    if (number.liveReplicas() < numExpectedReplicas) {
+    if (isNeededReplication(block, numExpectedReplicas, number.liveReplicas())) { 
       neededReplications.add(block,
                              number.liveReplicas(),
                              number.decommissionedReplicas,
@@ -1448,7 +1494,68 @@ public class BlockManager {
       return blocksToInvalidate.size();
     }
   }
+  
+  //Returns the number of racks over which a given block is replicated
+  //decommissioning/decommissioned nodes are not counted. corrupt replicas 
+  //are also ignored
+  int getNumberOfRacks(Block b) {
+    HashSet<String> rackSet = new HashSet<String>(0);
+    Collection<DatanodeDescriptor> corruptNodes = 
+                                  corruptReplicas.getNodes(b);
+    for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(b); 
+         it.hasNext();) {
+      DatanodeDescriptor cur = it.next();
+      if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) {
+        if ((corruptNodes == null ) || !corruptNodes.contains(cur)) {
+          String rackName = cur.getNetworkLocation();
+          if (!rackSet.contains(rackName)) {
+            rackSet.add(rackName);
+          }
+        }
+      }
+    }
+    return rackSet.size();
+  }
 
+  boolean blockHasEnoughRacks(Block b) {
+    if (!this.shouldCheckForEnoughRacks) {
+      return true;
+    }
+    boolean enoughRacks = false;;
+    Collection<DatanodeDescriptor> corruptNodes = 
+                                  corruptReplicas.getNodes(b);
+    int numExpectedReplicas = getReplication(b);
+    String rackName = null;
+    for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(b); 
+         it.hasNext();) {
+      DatanodeDescriptor cur = it.next();
+      if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) {
+        if ((corruptNodes == null ) || !corruptNodes.contains(cur)) {
+          if (numExpectedReplicas == 1) {
+            enoughRacks = true;
+            break;
+          }
+          String rackNameNew = cur.getNetworkLocation();
+          if (rackName == null) {
+            rackName = rackNameNew;
+          } else if (!rackName.equals(rackNameNew)) {
+            enoughRacks = true;
+            break;
+          }
+        }
+      }
+    }
+    return enoughRacks;
+  }
+
+  boolean isNeededReplication(Block b, int expectedReplication, int curReplicas) {
+    if ((curReplicas >= expectedReplication) && (blockHasEnoughRacks(b))) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+  
   long getMissingBlocksCount() {
     // not locking
     return Math.max(missingBlocksInPrevIter, missingBlocksInCurIter);
@@ -1483,4 +1590,26 @@ public class BlockManager {
   float getLoadFactor() {
     return blocksMap.getLoadFactor();
   }
+  
+  
+  /**
+   * Return a range of corrupt replica block ids. Up to numExpectedBlocks 
+   * blocks starting at the next block after startingBlockId are returned
+   * (fewer if numExpectedBlocks blocks are unavailable). If startingBlockId 
+   * is null, up to numExpectedBlocks blocks are returned from the beginning.
+   * If startingBlockId cannot be found, null is returned.
+   *
+   * @param numExpectedBlocks Number of block ids to return.
+   *  0 <= numExpectedBlocks <= 100
+   * @param startingBlockId Block id from which to start. If null, start at
+   *  beginning.
+   * @return Up to numExpectedBlocks blocks from startingBlockId if it exists
+   *
+   */
+  long[] getCorruptReplicaBlockIds(int numExpectedBlocks,
+                                   Long startingBlockId) {
+    return corruptReplicas.getCorruptReplicaBlockIds(numExpectedBlocks,
+                                                     startingBlockId);
+  }  
+  
 }
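The heart of the rack-awareness change in BlockManager is compact enough to restate outside the diff. The sketch below is an illustration only, not the committed code: it assumes the caller has already filtered out decommissioning and corrupt replicas and passes in just the rack names of the usable replicas.

    import java.util.HashSet;
    import java.util.List;

    // Illustrative sketch of the new rack check (not the committed code).
    class RackCheckSketch {
      // Count distinct racks across the block's usable replicas.
      static int numberOfRacks(List<String> replicaRacks) {
        return new HashSet<String>(replicaRacks).size();
      }

      // A single-replica block trivially satisfies the rack constraint;
      // otherwise the replicas must span at least two racks.
      static boolean blockHasEnoughRacks(List<String> replicaRacks,
                                         int expectedReplicas) {
        if (expectedReplicas == 1) {
          return !replicaRacks.isEmpty();
        }
        return numberOfRacks(replicaRacks) >= 2;
      }

      // Replication work is still needed if the block is under-replicated,
      // or fully replicated but confined to a single rack.
      static boolean isNeededReplication(List<String> replicaRacks,
                                         int expectedReplicas, int curReplicas) {
        return curReplicas < expectedReplicas
            || !blockHasEnoughRacks(replicaRacks, expectedReplicas);
      }
    }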

+ 56 - 1
src/java/org/apache/hadoop/hdfs/server/namenode/CorruptReplicasMap.java

@@ -33,7 +33,7 @@ import java.util.*;
 
 public class CorruptReplicasMap{
 
-  private Map<Block, Collection<DatanodeDescriptor>> corruptReplicasMap =
+  private SortedMap<Block, Collection<DatanodeDescriptor>> corruptReplicasMap =
     new TreeMap<Block, Collection<DatanodeDescriptor>>();
   
   /**
@@ -126,4 +126,59 @@ public class CorruptReplicasMap{
   public int size() {
     return corruptReplicasMap.size();
   }
+
+  /**
+   * Return a range of corrupt replica block ids. Up to numExpectedBlocks 
+   * blocks starting at the next block after startingBlockId are returned
+   * (fewer if numExpectedBlocks blocks are unavailable). If startingBlockId 
+   * is null, up to numExpectedBlocks blocks are returned from the beginning.
+   * If startingBlockId cannot be found, null is returned.
+   *
+   * @param numExpectedBlocks Number of block ids to return.
+   *  0 <= numExpectedBlocks <= 100
+   * @param startingBlockId Block id from which to start. If null, start at
+   *  beginning.
+   * @return Up to numExpectedBlocks blocks from startingBlockId if it exists
+   *
+   */
+  long[] getCorruptReplicaBlockIds(int numExpectedBlocks,
+                                   Long startingBlockId) {
+    if (numExpectedBlocks < 0 || numExpectedBlocks > 100) {
+      return null;
+    }
+    
+    Iterator<Block> blockIt = corruptReplicasMap.keySet().iterator();
+    
+    // if the starting block id was specified, iterate over keys until
+    // we find the matching block. If we find a matching block, break
+    // to leave the iterator on the next block after the specified block. 
+    if (startingBlockId != null) {
+      boolean isBlockFound = false;
+      while (blockIt.hasNext()) {
+        Block b = blockIt.next();
+        if (b.getBlockId() == startingBlockId) {
+          isBlockFound = true;
+          break; 
+        }
+      }
+      
+      if (!isBlockFound) {
+        return null;
+      }
+    }
+
+    ArrayList<Long> corruptReplicaBlockIds = new ArrayList<Long>();
+
+    // append up to numExpectedBlocks blockIds to our list
+    for(int i=0; i<numExpectedBlocks && blockIt.hasNext(); i++) {
+      corruptReplicaBlockIds.add(blockIt.next().getBlockId());
+    }
+    
+    long[] ret = new long[corruptReplicaBlockIds.size()];
+    for(int i=0; i<ret.length; i++) {
+      ret[i] = corruptReplicaBlockIds.get(i);
+    }
+    
+    return ret;
+  }  
 }
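As a rough usage sketch (not part of the patch), the paging contract of getCorruptReplicaBlockIds can be consumed like this; the caller would need to live in the same package since the method is package-private.

    // Hedged sketch: walk all corrupt replica block ids, 50 per call.
    static void printCorruptBlockIds(CorruptReplicasMap crm) {
      Long cursor = null;                       // null => start at the beginning
      while (true) {
        long[] page = crm.getCorruptReplicaBlockIds(50, cursor);
        if (page == null || page.length == 0) {
          break;                                // unknown cursor, or nothing left
        }
        for (long blockId : page) {
          System.out.println("corrupt replica block id: " + blockId);
        }
        cursor = page[page.length - 1];         // resume after the last id seen
      }
    }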

+ 21 - 0
src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -3743,4 +3743,25 @@ public class FSNamesystem implements FSConstants, FSNamesystemMBean {
   DatanodeDescriptor getDatanode(String nodeID) {
     return datanodeMap.get(nodeID);
   }
+
+  /**
+   * Return a range of corrupt replica block ids. Up to numExpectedBlocks 
+   * blocks starting at the next block after startingBlockId are returned
+   * (fewer if numExpectedBlocks blocks are unavailable). If startingBlockId 
+   * is null, up to numExpectedBlocks blocks are returned from the beginning.
+   * If startingBlockId cannot be found, null is returned.
+   *
+   * @param numExpectedBlocks Number of block ids to return.
+   *  0 <= numExpectedBlocks <= 100
+   * @param startingBlockId Block id from which to start. If null, start at
+   *  beginning.
+   * @return Up to numExpectedBlocks blocks from startingBlockId if it exists
+   *
+   */
+  long[] getCorruptReplicaBlockIds(int numExpectedBlocks,
+                                   Long startingBlockId) {  
+    return blockManager.getCorruptReplicaBlockIds(numExpectedBlocks,
+                                                  startingBlockId);
+  }
+
 }

+ 1 - 2
src/java/org/apache/hadoop/hdfs/server/namenode/ListPathsServlet.java

@@ -165,11 +165,10 @@ public class ListPathsServlet extends DfsServlet {
         }
         catch(RemoteException re) {re.writeXml(p, doc);}
       }
-    } finally {
       if (doc != null) {
         doc.endDocument();
       }
-
+    } finally {
       if (out != null) {
         out.close();
       }

+ 195 - 0
src/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java

@@ -28,6 +28,8 @@ import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;
 import javax.servlet.jsp.JspWriter;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.Block;
 import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.FSConstants.UpgradeAction;
 import org.apache.hadoop.hdfs.server.common.JspHelper;
@@ -38,6 +40,8 @@ import org.apache.hadoop.util.ServletUtil;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.VersionInfo;
 
+import org.znerd.xmlenc.*;
+
 class NamenodeJspHelper {
   static String getSafeModeText(FSNamesystem fsn) {
     if (!fsn.isInSafeMode())
@@ -449,4 +453,195 @@ class NamenodeJspHelper {
       }
     }
   }
+  
+  // utility class used in block_info_xml.jsp
+  static class XMLBlockInfo {
+    final Block block;
+    final INodeFile inode;
+    final FSNamesystem fsn;
+    
+    public XMLBlockInfo(FSNamesystem fsn, Long blockId) {
+      this.fsn = fsn;
+      if (blockId == null) {
+        this.block = null;
+        this.inode = null;
+      } else {
+        this.block = new Block(blockId);
+        this.inode = fsn.blockManager.getINode(block);
+      }
+    }
+
+    private String getLocalParentDir(INode inode) {
+      StringBuilder pathBuf = new StringBuilder();
+      INode node = inode;
+      
+      // loop up to directory root, prepending each directory name to buffer
+      while ((node = node.getParent()) != null && node.getLocalName() != "") {
+        pathBuf.insert(0, '/').insert(0, node.getLocalName());
+      }
+
+      return pathBuf.toString();
+    }
+
+    public void toXML(XMLOutputter doc) throws IOException {
+      doc.startTag("block_info");
+      if (block == null) {
+        doc.startTag("error");
+        doc.pcdata("blockId must be a Long");
+        doc.endTag();
+      }else{
+        doc.startTag("block_id");
+        doc.pcdata(""+block.getBlockId());
+        doc.endTag();
+
+        doc.startTag("block_name");
+        doc.pcdata(block.getBlockName());
+        doc.endTag();
+
+        if (inode != null) {
+          doc.startTag("file");
+
+          doc.startTag("local_name");
+          doc.pcdata(inode.getLocalName());
+          doc.endTag();
+
+          doc.startTag("local_directory");
+          doc.pcdata(getLocalParentDir(inode));
+          doc.endTag();
+
+          doc.startTag("user_name");
+          doc.pcdata(inode.getUserName());
+          doc.endTag();
+
+          doc.startTag("group_name");
+          doc.pcdata(inode.getGroupName());
+          doc.endTag();
+
+          doc.startTag("is_directory");
+          doc.pcdata(""+inode.isDirectory());
+          doc.endTag();
+
+          doc.startTag("access_time");
+          doc.pcdata(""+inode.getAccessTime());
+          doc.endTag();
+
+          doc.startTag("is_under_construction");
+          doc.pcdata(""+inode.isUnderConstruction());
+          doc.endTag();
+
+          doc.startTag("ds_quota");
+          doc.pcdata(""+inode.getDsQuota());
+          doc.endTag();
+
+          doc.startTag("permission_status");
+          doc.pcdata(inode.getPermissionStatus().toString());
+          doc.endTag();
+
+          doc.startTag("replication");
+          doc.pcdata(""+inode.getReplication());
+          doc.endTag();
+
+          doc.startTag("disk_space_consumed");
+          doc.pcdata(""+inode.diskspaceConsumed());
+          doc.endTag();
+
+          doc.startTag("preferred_block_size");
+          doc.pcdata(""+inode.getPreferredBlockSize());
+          doc.endTag();
+
+          doc.endTag(); // </file>
+        } 
+
+        doc.startTag("replicas");
+       
+        if (fsn.blockManager.blocksMap.contains(block)) {
+          Iterator<DatanodeDescriptor> it =
+            fsn.blockManager.blocksMap.nodeIterator(block);
+
+          while (it.hasNext()) {
+            doc.startTag("replica");
+
+            DatanodeDescriptor dd = it.next();
+
+            doc.startTag("host_name");
+            doc.pcdata(dd.getHostName());
+            doc.endTag();
+
+            boolean isCorrupt = fsn.getCorruptReplicaBlockIds(0,
+                                  block.getBlockId()) != null;
+            
+            doc.startTag("is_corrupt");
+            doc.pcdata(""+isCorrupt);
+            doc.endTag();
+            
+            doc.endTag(); // </replica>
+          }
+
+        } 
+        doc.endTag(); // </replicas>
+                
+      }
+      
+      doc.endTag(); // </block_info>
+      
+    }
+  }
+  
+  // utility class used in corrupt_replicas_xml.jsp
+  static class XMLCorruptBlockInfo {
+    final FSNamesystem fsn;
+    final Configuration conf;
+    final Long startingBlockId;
+    final int numCorruptBlocks;
+    
+    public XMLCorruptBlockInfo(FSNamesystem fsn, Configuration conf,
+                               int numCorruptBlocks, Long startingBlockId) {
+      this.fsn = fsn;
+      this.conf = conf;
+      this.numCorruptBlocks = numCorruptBlocks;
+      this.startingBlockId = startingBlockId;
+    }
+
+
+    public void toXML(XMLOutputter doc) throws IOException {
+      
+      doc.startTag("corrupt_block_info");
+      
+      if (numCorruptBlocks < 0 || numCorruptBlocks > 100) {
+        doc.startTag("error");
+        doc.pcdata("numCorruptBlocks must be >= 0 and <= 100");
+        doc.endTag();
+      }
+      
+      doc.startTag("dfs_replication");
+      doc.pcdata(""+conf.getInt("dfs.replication", 3));
+      doc.endTag();
+      
+      doc.startTag("num_missing_blocks");
+      doc.pcdata(""+fsn.getMissingBlocksCount());
+      doc.endTag();
+      
+      doc.startTag("num_corrupt_replica_blocks");
+      doc.pcdata(""+fsn.getCorruptReplicaBlocks());
+      doc.endTag();
+     
+      doc.startTag("corrupt_replica_block_ids");
+      long[] corruptBlockIds
+        = fsn.getCorruptReplicaBlockIds(numCorruptBlocks,
+                                        startingBlockId);
+      if (corruptBlockIds != null) {
+        for (Long blockId: corruptBlockIds) {
+          doc.startTag("block_id");
+          doc.pcdata(""+blockId);
+          doc.endTag();
+        }
+      }
+      
+      doc.endTag(); // </corrupt_replica_block_ids>
+
+      doc.endTag(); // </corrupt_block_info>
+      
+      doc.getWriter().flush();
+    }
+  }    
 }
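The two helper classes above are driven by the new JSP pages added later in this commit; a minimal driver sketch (fsn being the live FSNamesystem and out the page's Writer) mirrors the calls the JSPs make. In practice each page creates its own XMLOutputter; both paths are shown together here only for brevity.

    // Hedged sketch of how the JSP pages use the XML helpers.
    XMLOutputter doc = new XMLOutputter(out, "UTF-8");

    // block_info_xml.jsp: describe a single block by id
    new NamenodeJspHelper.XMLBlockInfo(fsn, 8888705098093096373L).toXML(doc);

    // corrupt_replicas_xml.jsp: list up to 10 corrupt replica block ids
    new NamenodeJspHelper.XMLCorruptBlockInfo(fsn, new Configuration(), 10, null)
        .toXML(doc);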

+ 7 - 5
src/java/org/apache/hadoop/hdfs/server/namenode/UnderReplicatedBlocks.java

@@ -26,7 +26,7 @@ import org.apache.hadoop.hdfs.protocol.Block;
  * Blocks have only one replicas has the highest
  */
 class UnderReplicatedBlocks implements Iterable<Block> {
-  static final int LEVEL = 3;
+  static final int LEVEL = 4;
   private List<TreeSet<Block>> priorityQueues = new ArrayList<TreeSet<Block>>();
       
   /* constructor */
@@ -53,7 +53,7 @@ class UnderReplicatedBlocks implements Iterable<Block> {
     }
     return size;
   }
-        
+
   /* Check if a block is in the neededReplication queue */
   synchronized boolean contains(Block block) {
     for(TreeSet<Block> set:priorityQueues) {
@@ -71,8 +71,10 @@ class UnderReplicatedBlocks implements Iterable<Block> {
                           int curReplicas, 
                           int decommissionedReplicas,
                           int expectedReplicas) {
-    if (curReplicas<0 || curReplicas>=expectedReplicas) {
-      return LEVEL; // no need to replicate
+    if (curReplicas<0) {
+      return LEVEL;
+    } else if (curReplicas>=expectedReplicas) {
+      return 3; // Block doesn't have enough racks
     } else if(curReplicas==0) {
       // If there are zero non-decommissioned replica but there are
       // some decommissioned replicas, then assign them highest priority
@@ -99,7 +101,7 @@ class UnderReplicatedBlocks implements Iterable<Block> {
                            int curReplicas, 
                            int decomissionedReplicas,
                            int expectedReplicas) {
-    if(curReplicas<0 || expectedReplicas <= curReplicas) {
+    if(curReplicas<0) {
       return false;
     }
     int priLevel = getPriority(block, curReplicas, decomissionedReplicas,
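With LEVEL bumped from 3 to 4, queue 3 becomes a new, lowest-priority bucket for blocks that already have enough replicas but sit on a single rack. Only the changed branch is sketched below; the pre-existing under-replication levels 0..2 are untouched by this diff.

    static final int LEVEL = 4;      // was 3 before this change

    // Illustrative sketch of the split branch (not the committed code).
    static int priorityOfReplicatedBlock(int curReplicas, int expectedReplicas) {
      if (curReplicas < 0) {
        return LEVEL;                // invalid count: not queued at all
      }
      if (curReplicas >= expectedReplicas) {
        return 3;                    // replicated, but not on enough racks
      }
      return -1;                     // under-replicated: handled by the
                                     // pre-existing priority levels 0..2
    }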

+ 134 - 0
src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestBlocksWithNotEnoughRacks.java

@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdfs.server.namenode;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FsShell;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem.NumberReplicas;
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.log4j.Level;
+
+import junit.framework.TestCase;
+
+public class TestBlocksWithNotEnoughRacks extends TestCase {
+
+  static {
+    ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL) ;
+  }
+
+  //Creates a block with all datanodes on same rack
+  //Adds additional datanode on a different rack
+  //The block should be replicated to the new rack
+  public void testSufficientlyReplicatedBlocksWithNotEnoughRacks() throws Exception {
+    Configuration conf = new Configuration();
+    conf.setLong("dfs.heartbeat.interval", 1L);
+    conf.setInt("dfs.replication.interval", 1);
+    conf.set("topology.script.file.name", "xyz");
+    final short REPLICATION_FACTOR = 3;
+    final String FILE_NAME = "/testFile";
+    final Path FILE_PATH = new Path(FILE_NAME);
+    //All datanodes are on the same rack
+    String racks[] = {"/rack1","/rack1","/rack1",} ;
+    MiniDFSCluster cluster = new MiniDFSCluster(conf, REPLICATION_FACTOR, true, racks);
+    try {
+      // create a file with one block with a replication factor of 3
+      final FileSystem fs = cluster.getFileSystem();
+      DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
+      DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
+      
+      Block b = DFSTestUtil.getFirstBlock(fs, FILE_PATH);
+      final FSNamesystem namesystem = cluster.getNamesystem();
+      int numRacks = namesystem.blockManager.getNumberOfRacks(b);
+      
+      //Add a new datanode on a different rack
+      String newRacks[] = {"/rack2"} ;
+      cluster.startDataNodes(conf, 1, true, null, newRacks);
+      
+      Thread.sleep(5000);
+            
+      numRacks = namesystem.blockManager.getNumberOfRacks(b);
+      NumberReplicas number = namesystem.blockManager.countNodes(b);
+      int curReplicas = number.liveReplicas();
+
+      System.out.println("curReplicas = " + curReplicas);
+      System.out.println("numRacks = " + numRacks);
+      System.out.println("Size = " + namesystem.blockManager.neededReplications.size());
+
+      assertEquals(2,numRacks);
+      assertTrue(curReplicas == REPLICATION_FACTOR);
+      assertEquals(0,namesystem.blockManager.neededReplications.size());
+    } finally {
+      cluster.shutdown();
+    }
+    
+  }
+
+  public void testUnderReplicatedNotEnoughRacks() throws Exception {
+    Configuration conf = new Configuration();
+    conf.setLong("dfs.heartbeat.interval", 1L);
+    conf.setInt("dfs.replication.interval", 1);
+    conf.setInt("dfs.replication.pending.timeout.sec", 1);
+    conf.set("topology.script.file.name", "xyz");
+    short REPLICATION_FACTOR = 3;
+    final String FILE_NAME = "/testFile";
+    final Path FILE_PATH = new Path(FILE_NAME);
+    //All datanodes are on the same rack
+    String racks[] = {"/rack1","/rack1","/rack1",} ;
+    MiniDFSCluster cluster = new MiniDFSCluster(conf, REPLICATION_FACTOR, true, racks);
+    try {
+      // create a file with one block with a replication factor of 3
+      final FileSystem fs = cluster.getFileSystem();
+      DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
+      DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
+      
+      Block b = DFSTestUtil.getFirstBlock(fs, FILE_PATH);
+      final FSNamesystem namesystem = cluster.getNamesystem();
+      int numRacks = namesystem.blockManager.getNumberOfRacks(b);
+      
+      //Add a new datanode on a different rack
+      String newRacks[] = {"/rack2","/rack2"} ;
+      cluster.startDataNodes(conf, 2, true, null, newRacks);
+      REPLICATION_FACTOR = 5;
+      namesystem.setReplication(FILE_NAME, REPLICATION_FACTOR); 
+      Thread.sleep(30000);
+            
+
+      numRacks = namesystem.blockManager.getNumberOfRacks(b);
+      NumberReplicas number = namesystem.blockManager.countNodes(b);
+      int curReplicas = number.liveReplicas();
+
+      System.out.println("curReplicas = " + curReplicas);
+      System.out.println("numRacks = " + numRacks);
+      System.out.println("Size = " + namesystem.blockManager.neededReplications.size());
+
+      assertEquals(2,numRacks);
+      assertTrue(curReplicas == REPLICATION_FACTOR);
+      assertEquals(0,namesystem.blockManager.neededReplications.size());
+    } finally {
+      cluster.shutdown();
+    }
+    
+  }
+}

+ 122 - 0
src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCorruptReplicaInfo.java

@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.IOException;
+import java.util.*;
+import junit.framework.TestCase;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.server.namenode.CorruptReplicasMap;
+import org.apache.hadoop.hdfs.server.namenode.DatanodeDescriptor;
+
+
+/**
+ * This test makes sure that 
+ *   CorruptReplicasMap::numBlocksWithCorruptReplicas and
+ *   CorruptReplicasMap::getCorruptReplicaBlockIds
+ *   return the correct values
+ */
+public class TestCorruptReplicaInfo extends TestCase {
+  
+  private static final Log LOG = 
+                           LogFactory.getLog(TestCorruptReplicaInfo.class);
+  
+  private Map<Long, Block> block_map =
+    new HashMap<Long, Block>();  
+    
+  // Allow easy block creation by block id
+  // Return existing block if one with same block id already exists
+  private Block getBlock(Long block_id) {
+    if (!block_map.containsKey(block_id)) {
+      block_map.put(block_id, new Block(block_id,0,0));
+    }
+    
+    return block_map.get(block_id);
+  }
+  
+  private Block getBlock(int block_id) {
+    return getBlock((long)block_id);
+  }
+  
+  public void testCorruptReplicaInfo() throws IOException, 
+                                       InterruptedException {
+    
+      CorruptReplicasMap crm = new CorruptReplicasMap();
+      
+      // Make sure initial values are returned correctly
+      assertEquals("Number of corrupt blocks must initially be 0", 0, crm.size());
+      assertNull("Param n cannot be less than 0", crm.getCorruptReplicaBlockIds(-1, null));
+      assertNull("Param n cannot be greater than 100", crm.getCorruptReplicaBlockIds(101, null));
+      long[] l = crm.getCorruptReplicaBlockIds(0, null);
+      assertNotNull("n = 0 must return non-null", l);
+      assertEquals("n = 0 must return an empty list", 0, l.length);
+
+      // create a list of block_ids. A list is used to allow easy validation of the
+      // output of getCorruptReplicaBlockIds
+      int NUM_BLOCK_IDS = 140;
+      List<Long> block_ids = new LinkedList<Long>();
+      for (int i=0;i<NUM_BLOCK_IDS;i++) {
+        block_ids.add((long)i);
+      }
+      
+      DatanodeDescriptor dn1 = new DatanodeDescriptor();
+      DatanodeDescriptor dn2 = new DatanodeDescriptor();
+      DatanodeDescriptor dn3 = new DatanodeDescriptor();
+      
+      crm.addToCorruptReplicasMap(getBlock(0), dn1);
+      assertEquals("Number of corrupt blocks not returning correctly",
+                   1, crm.size());
+      crm.addToCorruptReplicasMap(getBlock(1), dn1);
+      assertEquals("Number of corrupt blocks not returning correctly",
+                   2, crm.size());
+      
+      crm.addToCorruptReplicasMap(getBlock(1), dn2);
+      assertEquals("Number of corrupt blocks not returning correctly",
+                   2, crm.size());
+      
+      crm.removeFromCorruptReplicasMap(getBlock(1));
+      assertEquals("Number of corrupt blocks not returning correctly",
+                   1, crm.size());
+      
+      crm.removeFromCorruptReplicasMap(getBlock(0));
+      assertEquals("Number of corrupt blocks not returning correctly",
+                   0, crm.size());
+      
+      for (Long block_id: block_ids) {
+        crm.addToCorruptReplicasMap(getBlock(block_id), dn1);
+      }
+            
+      assertEquals("Number of corrupt blocks not returning correctly",
+                   NUM_BLOCK_IDS, crm.size());
+      
+      assertTrue("First five block ids not returned correctly ",
+                Arrays.equals(new long[]{0,1,2,3,4},
+                              crm.getCorruptReplicaBlockIds(5, null)));
+                              
+      LOG.info(crm.getCorruptReplicaBlockIds(10, 7L));
+      LOG.info(block_ids.subList(7, 18));
+
+      assertTrue("10 blocks after 7 not returned correctly ",
+                Arrays.equals(new long[]{8,9,10,11,12,13,14,15,16,17},
+                              crm.getCorruptReplicaBlockIds(10, 7L)));
+      
+  }
+}

+ 13 - 0
src/test/mapred-site.xml

@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put mapreduce site-specific property overrides in this file. -->
+
+<configuration>
+
+<property>
+  <name>mapred.job.tracker.retire.jobs</name>
+  <value>false</value>
+  <description></description>
+</property>
+</configuration>
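For context, MapReduce test code sees this override through the usual configuration lookup once src/test/mapred-site.xml is on the test classpath; a hedged sketch, assuming JobConf loads mapred-site.xml as a default resource:

    // Hedged sketch: reading the test override.
    org.apache.hadoop.mapred.JobConf conf = new org.apache.hadoop.mapred.JobConf();
    boolean retireJobs = conf.getBoolean("mapred.job.tracker.retire.jobs", true);
    // expected: false while the test override above is on the classpath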

+ 97 - 0
src/webapps/hdfs/block_info_xml.jsp

@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="UTF-8"?><%!
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file 
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ 
+ /*
+ 
+  This script outputs information about a block (as XML). The script accepts a 
+  GET parameter named blockId which should be block id (as a long).
+
+  Example output is below (the blockId was 8888705098093096373):
+    <block_info>
+      <block_id>8888705098093096373</block_id>
+      <block_name>blk_8888705098093096373</block_name>
+      <file>
+        <local_name>some_file_name</local_name>
+        <local_directory>/input/</local_directory>
+        <user_name>user_name</user_name>
+        <group_name>supergroup</group_name>
+        <is_directory>false</is_directory>
+        <access_time>1251166313680</access_time>
+        <is_under_construction>false</is_under_construction>
+        <ds_quota>-1</ds_quota>
+        <permission_status>user_name:supergroup:rw-r--r--</permission_status>
+        <replication>1</replication>
+        <disk_space_consumed>2815</disk_space_consumed>
+        <preferred_block_size>67108864</preferred_block_size>
+      </file>
+      <replicas>
+        <replica>
+          <host_name>hostname</host_name>
+          <is_corrupt>false</is_corrupt>
+        </replica>
+      </replicas>
+    </block_info> 
+
+  Notes:
+    - block_info/file will only exist if the file can be found
+    - block_info/replicas can contain 0 or more children 
+    - If an error exists, block_info/error will exist and contain a human
+      readable error message
+ 
+*/
+ 
+%>
+<%@ page
+  contentType="application/xml"
+  import="java.io.IOException"
+  import="java.util.Iterator"
+  import="org.apache.hadoop.conf.Configuration"
+  import="org.apache.hadoop.hdfs.protocol.Block"
+  import="org.apache.hadoop.hdfs.server.namenode.INode"
+  import="org.apache.hadoop.hdfs.server.namenode.BlocksMap"
+  import="org.apache.hadoop.hdfs.server.namenode.BlockInfo"
+  import="org.apache.hadoop.hdfs.server.namenode.DatanodeDescriptor"
+  import="org.apache.hadoop.hdfs.server.namenode.NamenodeJspHelper.XMLBlockInfo"
+  import="org.apache.hadoop.hdfs.server.common.JspHelper"
+  import="org.apache.hadoop.util.ServletUtil"
+  import="org.znerd.xmlenc.*"
+  
+%>
+<%!
+  //for java.io.Serializable
+  private static final long serialVersionUID = 1L;  
+%>
+<%
+NameNode nn = (NameNode)application.getAttribute("name.node");
+String namenodeRole = nn.getRole().toString();
+FSNamesystem fsn = nn.getNamesystem();
+
+Long blockId = null;
+try {
+  blockId = JspHelper.validateLong(request.getParameter("blockId"));
+} catch(NumberFormatException e) {
+  blockId = null;
+}
+
+
+XMLBlockInfo bi = new XMLBlockInfo(fsn, blockId);
+XMLOutputter doc = new XMLOutputter(out, "UTF-8");
+bi.toXML(doc);
+
+%>
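A rough client-side sketch of consuming the new page (the host, port and block id below are placeholders, not values from this patch):

    import java.io.InputStream;
    import java.net.URL;
    import javax.xml.parsers.DocumentBuilderFactory;
    import org.w3c.dom.Document;

    public class BlockInfoClientSketch {
      public static void main(String[] args) throws Exception {
        URL url = new URL("http://namenode.example.com:50070/block_info_xml.jsp"
            + "?blockId=8888705098093096373");
        InputStream in = url.openStream();
        try {
          Document doc = DocumentBuilderFactory.newInstance()
              .newDocumentBuilder().parse(in);
          String blockName = doc.getElementsByTagName("block_name")
              .item(0).getTextContent();
          System.out.println("block_name = " + blockName);
        } finally {
          in.close();
        }
      }
    }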

+ 91 - 0
src/webapps/hdfs/corrupt_replicas_xml.jsp

@@ -0,0 +1,91 @@
+<?xml version="1.0" encoding="UTF-8"?><%!
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file 
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ 
+ /*
+ 
+  This script outputs information about corrupt replicas on the system (as XML). 
+  
+  The script takes two GET parameters:
+    - numCorruptBlocks The number of corrupt blocks to return. Must be >= 0 &&
+      <= 100. Defaults to 10.
+    - startingBlockId The block id (as a long) from which to begin iterating. 
+      Output does not include the starting block id (it begins at the following
+      block id). If not given, iteration starts from beginning. 
+
+  Example output is below:
+      <corrupt_block_info>
+        <dfs_replication>1</dfs_replication>
+        <num_missing_blocks>1</num_missing_blocks>
+        <num_corrupt_replica_blocks>1</num_corrupt_replica_blocks>
+        <corrupt_replica_block_ids>
+          <block_id>-2207002825050436217</block_id>
+        </corrupt_replica_block_ids>
+      </corrupt_block_info>
+
+  Notes:
+    - corrupt_block_info/corrupt_replica_block_ids will 0 to numCorruptBlocks
+      children
+    - If an error exists, corrupt_block_info/error will exist and
+      contain a human readable error message
+ 
+*/
+ 
+%>
+<%@ page
+  contentType="application/xml"
+  import="java.io.IOException"
+  import="java.util.List"
+  import="org.apache.hadoop.conf.Configuration"
+  import="org.apache.hadoop.hdfs.server.common.JspHelper"
+  import="org.apache.hadoop.hdfs.server.namenode.NamenodeJspHelper.XMLCorruptBlockInfo"
+  import="org.apache.hadoop.util.ServletUtil"
+  import="org.znerd.xmlenc.*"
+%>
+<%!
+  private static final long serialVersionUID = 1L;
+%>
+<%
+
+  NameNode nn = (NameNode)application.getAttribute("name.node");
+  FSNamesystem fsn = nn.getNamesystem();
+
+  Integer numCorruptBlocks = 10;
+  try {
+    Long l = JspHelper.validateLong(request.getParameter("numCorruptBlocks"));
+    if (l != null) {
+      numCorruptBlocks = l.intValue();
+    }
+  } catch(NumberFormatException e) {
+    
+  }
+
+  Long startingBlockId = null;
+  try {
+    startingBlockId =
+      JspHelper.validateLong(request.getParameter("startingBlockId"));
+  } catch(NumberFormatException e) { 
+  }  
+
+  XMLCorruptBlockInfo cbi = new XMLCorruptBlockInfo(fsn,
+                                                    new Configuration(),
+                                                    numCorruptBlocks,
+                                                    startingBlockId);
+  XMLOutputter doc = new XMLOutputter(out, "UTF-8");
+  cbi.toXML(doc);
+%>