Explorar o código

HDFS-8025. Addendum fix for HDFS-3087 Decommissioning on NN restart can complete without blocks being replicated. Contributed by Ming Ma.
(cherry picked from commit 5a540c3d3107199f4632e2ad7ee8ff913b107a04)

Conflicts:
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

Kihwal Lee hai 10 anos
pai
achega
9a111fcd1d

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -971,6 +971,9 @@ Release 2.7.0 - UNRELEASED
     HDFS-8072. Reserved RBW space is not released if client terminates while
     writing block. (Arpit Agarwal)
 
+    HDFS-8025. Addendum fix for HDFS-3087 Decomissioning on NN restart can
+    complete without blocks being replicated. (Ming Ma via wang)
+
     BREAKDOWN OF HDFS-7584 SUBTASKS AND RELATED JIRAS
 
       HDFS-7720. Quota by Storage Type API, tools and ClientNameNode

+ 5 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

@@ -3308,6 +3308,11 @@ public class BlockManager {
    * liveness. Dead nodes cannot always be safely decommissioned.
    */
   boolean isNodeHealthyForDecommission(DatanodeDescriptor node) {
+    if (!node.checkBlockReportReceived()) {
+      LOG.info("Node {} hasn't sent its first block report.", node);
+      return false;
+    }
+
     if (node.isAlive) {
       return true;
     }

+ 12 - 20
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java

@@ -882,9 +882,12 @@ public class TestDecommission {
     int numNamenodes = 1;
     int numDatanodes = 1;
     int replicas = 1;
-    
+    conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY,
+        DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_DEFAULT);
+    conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INITIAL_DELAY_KEY, 5);
+
     startCluster(numNamenodes, numDatanodes, conf);
-    Path file1 = new Path("testDecommission.dat");
+    Path file1 = new Path("testDecommissionWithNamenodeRestart.dat");
     FileSystem fileSys = cluster.getFileSystem();
     writeFile(fileSys, file1, replicas);
         
@@ -894,37 +897,26 @@ public class TestDecommission {
     String excludedDatanodeName = info[0].getXferAddr();
 
     writeConfigFile(excludeFile, new ArrayList<String>(Arrays.asList(excludedDatanodeName)));
-    
+
     //Add a new datanode to cluster
     cluster.startDataNodes(conf, 1, true, null, null, null, null);
     numDatanodes+=1;
-    
+
     assertEquals("Number of datanodes should be 2 ", 2, cluster.getDataNodes().size());
     //Restart the namenode
     cluster.restartNameNode();
     DatanodeInfo datanodeInfo = NameNodeAdapter.getDatanode(
         cluster.getNamesystem(), excludedDatanodeID);
     waitNodeState(datanodeInfo, AdminStates.DECOMMISSIONED);
-    
+
     // Ensure decommissioned datanode is not automatically shutdown
     assertEquals("All datanodes must be alive", numDatanodes, 
         client.datanodeReport(DatanodeReportType.LIVE).length);
-    // wait for the block to be replicated
-    int tries = 0;
-    while (tries++ < 20) {
-      try {
-        Thread.sleep(1000);
-        if (checkFile(fileSys, file1, replicas, datanodeInfo.getXferAddr(),
-            numDatanodes) == null) {
-          break;
-        }
-      } catch (InterruptedException ie) {
-      }
-    }
-    assertTrue("Checked if block was replicated after decommission, tried "
-        + tries + " times.", tries < 20);
-    cleanupFile(fileSys, file1);
+    assertTrue("Checked if block was replicated after decommission.",
+        checkFile(fileSys, file1, replicas, datanodeInfo.getXferAddr(),
+        numDatanodes) == null);
 
+    cleanupFile(fileSys, file1);
     // Restart the cluster and ensure recommissioned datanodes
     // are allowed to register with the namenode
     cluster.shutdown();