Просмотр исходного кода

HDFS-4261. Fix bugs in Balaner causing infinite loop and TestBalancerWithNodeGroup timeing out. Contributed by Junping Du

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1@1488865 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze 12 лет назад
Родитель
Сommit
59643d1679

+ 3 - 0
CHANGES.txt

@@ -58,6 +58,9 @@ Release 1.3.0 - unreleased
     HADOOP-8981. TestMetricsSystemImpl fails on Windows. (Xuan Gong, backported
     HADOOP-8981. TestMetricsSystemImpl fails on Windows. (Xuan Gong, backported
     by Chris Nauroth via suresh)
     by Chris Nauroth via suresh)
 
 
+    HDFS-4261. Fix bugs in Balaner causing infinite loop and
+    TestBalancerWithNodeGroup timeing out.  (Junping Du via szetszwo)
+
 Release 1.2.1 - Unreleased 
 Release 1.2.1 - Unreleased 
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 16 - 4
src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java

@@ -193,6 +193,8 @@ public class Balancer implements Tool {
    */
    */
   public static final int MAX_NUM_CONCURRENT_MOVES = 5;
   public static final int MAX_NUM_CONCURRENT_MOVES = 5;
   
   
+  public static final int MAX_NO_PENDING_BLOCK_ITERATIONS = 5;
+  
   private Configuration conf;
   private Configuration conf;
 
 
   private double threshold = 10D;
   private double threshold = 10D;
@@ -746,6 +748,7 @@ public class Balancer implements Tool {
       long startTime = Util.now();
       long startTime = Util.now();
       this.blocksToReceive = 2*scheduledSize;
       this.blocksToReceive = 2*scheduledSize;
       boolean isTimeUp = false;
       boolean isTimeUp = false;
+      int noPendingBlockIteration = 0;
       while(!isTimeUp && scheduledSize > 0 &&
       while(!isTimeUp && scheduledSize > 0 &&
           (!srcBlockList.isEmpty() || blocksToReceive > 0)) {
           (!srcBlockList.isEmpty() || blocksToReceive > 0)) {
         PendingBlockMove pendingBlock = chooseNextBlockToMove();
         PendingBlockMove pendingBlock = chooseNextBlockToMove();
@@ -769,7 +772,15 @@ public class Balancer implements Tool {
             LOG.warn(StringUtils.stringifyException(e));
             LOG.warn(StringUtils.stringifyException(e));
             return;
             return;
           }
           }
-        } 
+        } else {
+          // source node cannot find a pendingBlockToMove, iteration +1
+          noPendingBlockIteration++;
+          // in case no blocks can be moved for source node's task,
+          // jump out of while-loop after 5 iterations.
+          if (noPendingBlockIteration >= MAX_NO_PENDING_BLOCK_ITERATIONS) {
+            scheduledSize = 0;
+          }
+        }
         
         
         // check if time is up or not
         // check if time is up or not
         if (Util.now()-startTime > MAX_ITERATION_TIME) {
         if (Util.now()-startTime > MAX_ITERATION_TIME) {
@@ -1496,7 +1507,11 @@ public class Balancer implements Tool {
       Formatter formatter = new Formatter(System.out);
       Formatter formatter = new Formatter(System.out);
       System.out.println("Time Stamp               Iteration#  Bytes Already Moved  Bytes Left To Move  Bytes Being Moved");
       System.out.println("Time Stamp               Iteration#  Bytes Already Moved  Bytes Left To Move  Bytes Being Moved");
       int iterations = 0;
       int iterations = 0;
+      
       while (true) {
       while (true) {
+        // clean all lists at the beginning of balancer iteration.
+        resetData();
+
         /* get all live datanodes of a cluster and their disk usage
         /* get all live datanodes of a cluster and their disk usage
          * decide the number of bytes need to be moved
          * decide the number of bytes need to be moved
          */
          */
@@ -1547,9 +1562,6 @@ public class Balancer implements Tool {
             return NO_MOVE_PROGRESS;
             return NO_MOVE_PROGRESS;
           }
           }
         }
         }
-
-        // clean all lists
-        resetData();
         
         
         try {
         try {
           Thread.sleep(2*conf.getLong("dfs.heartbeat.interval", 3));
           Thread.sleep(2*conf.getLong("dfs.heartbeat.interval", 3));

+ 4 - 4
src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java

@@ -216,7 +216,7 @@ public class TestBalancerWithNodeGroup {
    * to n0 or n1 as balancer policy with node group. Thus, we expect the balancer
    * to n0 or n1 as balancer policy with node group. Thus, we expect the balancer
    * to end in 5 iterations without move block process.
    * to end in 5 iterations without move block process.
    */
    */
-  @Test
+  @Test(timeout=60000)
   public void testBalancerEndInNoMoveProgress() throws Exception {
   public void testBalancerEndInNoMoveProgress() throws Exception {
     Configuration conf = createConf();
     Configuration conf = createConf();
     long[] capacities = new long[]{CAPACITY, CAPACITY, CAPACITY, CAPACITY};
     long[] capacities = new long[]{CAPACITY, CAPACITY, CAPACITY, CAPACITY};
@@ -255,7 +255,7 @@ public class TestBalancerWithNodeGroup {
    * Create a cluster with even distribution, and a new empty node is added to
    * Create a cluster with even distribution, and a new empty node is added to
    * the cluster, then test rack locality for balancer policy. 
    * the cluster, then test rack locality for balancer policy. 
    */
    */
-  @Test
+  @Test(timeout=60000)
   public void testBalancerWithRackLocality() throws Exception {
   public void testBalancerWithRackLocality() throws Exception {
     Configuration conf = createConf();
     Configuration conf = createConf();
     long[] capacities = new long[]{CAPACITY, CAPACITY};
     long[] capacities = new long[]{CAPACITY, CAPACITY};
@@ -294,7 +294,7 @@ public class TestBalancerWithNodeGroup {
       totalCapacity += newCapacity;
       totalCapacity += newCapacity;
 
 
       // run balancer and validate results
       // run balancer and validate results
-      runBalancer(conf, totalUsedSpace, totalCapacity);
+      runBalancerCanFinish(conf, totalUsedSpace, totalCapacity);
       
       
       DatanodeInfo[] datanodeReport = 
       DatanodeInfo[] datanodeReport = 
               client.getDatanodeReport(DatanodeReportType.ALL);
               client.getDatanodeReport(DatanodeReportType.ALL);
@@ -321,7 +321,7 @@ public class TestBalancerWithNodeGroup {
   /** Create a cluster with even distribution, and a new empty node is added to
   /** Create a cluster with even distribution, and a new empty node is added to
    *  the cluster, then test rack locality for balancer policy. 
    *  the cluster, then test rack locality for balancer policy. 
    **/
    **/
-  @Test
+  @Test(timeout=60000)
   public void testBalancerWithNodeGroup() throws Exception {
   public void testBalancerWithNodeGroup() throws Exception {
     Configuration conf = createConf();
     Configuration conf = createConf();
     long[] capacities = new long[]{CAPACITY, CAPACITY};
     long[] capacities = new long[]{CAPACITY, CAPACITY};