Browse Source

HDFS-2753. Fix standby getting stuck in safemode when blocks are written while SBN is down. Contributed by Hari Mankude and Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1229898 13f79535-47bb-0310-9956-ffa450edef68
Todd Lipcon 13 years ago
parent
commit
190dc1c91b

+ 2 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt

@@ -93,3 +93,5 @@ HDFS-2730. Refactor shared HA-related test code into HATestUtil class (todd)
 HDFS-2762. Fix TestCheckpoint timing out on HA branch. (Uma Maheswara Rao G via todd)
 
 HDFS-2724. NN web UI can throw NPE after startup, before standby state is entered. (todd)
+
+HDFS-2753. Fix standby getting stuck in safemode when blocks are written while SBN is down. (Hari Mankude and todd via todd)

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

@@ -1361,7 +1361,7 @@ public class BlockManager {
 
       // To minimize startup time, we discard any second (or later) block reports
       // that we receive while still in startup phase.
-      if (namesystem.isInStartupSafeMode() && node.numBlocks() > 0) {
+      if (namesystem.isInStartupSafeMode() && !node.isFirstBlockReport()) {
         NameNode.stateChangeLog.info("BLOCK* processReport: "
             + "discarded non-initial block report from " + nodeID.getName()
             + " because namenode still in startup phase");

+ 9 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java

@@ -151,6 +151,10 @@ public class DatanodeDescriptor extends DatanodeInfo {
   private long lastBlocksScheduledRollTime = 0;
   private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
   private int volumeFailures = 0;
+  
+  /** Set to false after processing first block report */
+  private boolean firstBlockReport = true;
+  
   /** 
    * When set to true, the node is not in include list and is not allowed
    * to communicate with the namenode
@@ -608,6 +612,11 @@ public class DatanodeDescriptor extends DatanodeInfo {
     if (heartbeatedSinceFailover) {
       blockContentsStale = false;
     }
+    firstBlockReport = false;
+  }
+  
+  boolean isFirstBlockReport() {
+    return firstBlockReport;
   }
 
   @Override

+ 34 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java

@@ -192,7 +192,7 @@ public class TestHASafeMode {
    *    knows there should only be 90 blocks, but it's still in safemode.
    * 8. NN2 doesn't ever recheck whether it should leave safemode.
    * 
-   * This is essentially the inverse of {@link #testBlocksAddedWhileStandbyShutdown()}
+   * This is essentially the inverse of {@link #testBlocksAddedBeforeStandbyRestart()}
    */
   @Test
   public void testBlocksRemovedBeforeStandbyRestart() throws Exception {
@@ -328,6 +328,39 @@ public class TestHASafeMode {
             "total blocks 5. Safe mode will be turned off automatically"));
   }
   
+  /**
+   * Regression test for HDFS-2753. In this bug, the following sequence was
+   * observed:
+   * - Some blocks are written to DNs while the SBN is down. This causes
+   *   the blockReceived messages to get queued in the BPServiceActor on the
+   *   DN.
+   * - When the SBN returns, the DN re-registers with the SBN, and then
+   *   flushes its blockReceived queue to the SBN before it sends its
+   *   first block report. This caused the first block report to be
+   *   incorrectly ignored.
+   * - The SBN would become stuck in safemode.
+   */
+  @Test
+  public void testBlocksAddedWhileStandbyIsDown() throws Exception {
+    DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L);
+
+    banner("Stopping standby");
+    cluster.shutdownNameNode(1);
+    
+    DFSTestUtil.createFile(fs, new Path("/test2"), 3*BLOCK_SIZE, (short) 3, 1L);
+
+    banner("Rolling edit log so standby gets all edits on restart");
+    nn0.getRpcServer().rollEditLog();
+    
+    restartStandby();
+    String status = nn1.getNamesystem().getSafemode();
+    assertTrue("Bad safemode status: '" + status + "'",
+        status.startsWith(
+            "Safe mode is ON." +
+            "The reported blocks 6 has reached the threshold 0.9990 of " +
+            "total blocks 6. Safe mode will be turned off automatically"));    
+  }
+  
   /**
    * Print a big banner in the test log to make debug easier.
    */