1
0
Selaa lähdekoodia

HDFS-2914. HA: Standby should not enter safemode when resources are low. Contributed by Vinay.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1347895 13f79535-47bb-0310-9956-ffa450edef68
Aaron Myers 13 vuotta sitten
vanhempi
commit
b98e26af58

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -300,6 +300,9 @@ Branch-2 ( Unreleased changes )
     HDFS-3485. DataTransferThrottler will over-throttle when currentTimeMillis
     jumps (Andy Isaacson via todd)
 
+    HDFS-2914. HA: Standby should not enter safemode when resources are low.
+    (Vinay via atm)
+
   BREAKDOWN OF HDFS-3042 SUBTASKS
 
     HDFS-2185. HDFS portion of ZK-based FailoverController (todd)

+ 15 - 5
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -572,8 +572,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
         !safeMode.isPopulatingReplQueues();
       setBlockTotal();
       blockManager.activate(conf);
-      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
-      nnrmthread.start();
     } finally {
       writeUnlock();
     }
@@ -590,7 +588,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     writeLock();
     try {
       if (blockManager != null) blockManager.close();
-      if (nnrmthread != null) nnrmthread.interrupt();
     } finally {
       writeUnlock();
     }
@@ -644,6 +641,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       }
       leaseManager.startMonitor();
       startSecretManagerIfNecessary();
+
+      //ResourceMonitor required only at ActiveNN. See HDFS-2914
+      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
+      nnrmthread.start();
     } finally {
       writeUnlock();
     }
@@ -666,6 +667,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       if (leaseManager != null) {
         leaseManager.stopMonitor();
       }
+      if (nnrmthread != null) {
+        ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
+        nnrmthread.interrupt();
+      }
       if (dir != null && dir.fsImage != null) {
         if (dir.fsImage.editLog != null) {
           dir.fsImage.editLog.close();
@@ -3193,10 +3198,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
    * acceptable levels, this daemon will cause the NN to exit safe mode.
    */
   class NameNodeResourceMonitor implements Runnable  {
+    volatile boolean shouldNNRmRun = true; // cross-thread stop flag
     @Override
     public void run () {
       try {
-        while (fsRunning) {
+        while (fsRunning && shouldNNRmRun) {
           checkAvailableResources();
           if(!nameNodeHasResourcesAvailable()) {
             String lowResourcesMsg = "NameNode low on available disk space. ";
@@ -3217,7 +3223,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
         FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
       }
     }
-  }
+
+    public void stopMonitor() { // asks the run() loop to finish
+      shouldNNRmRun = false; // run() tests this flag in its while condition
+    }
+ }
   
   public FSImage getFSImage() {
     return dir.fsImage;

+ 12 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java

@@ -17,6 +17,8 @@
  */
 package org.apache.hadoop.hdfs.server.namenode.ha;
 
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
 import static org.junit.Assert.*;
 
 import java.io.File;
@@ -127,6 +129,7 @@ public class TestFailureOfSharedDir {
   @Test
   public void testFailureOfSharedDir() throws Exception {
     Configuration conf = new Configuration();
+    conf.setLong(DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY, 2000);
     
     // The shared edits dir will automatically be marked required.
     MiniDFSCluster cluster = null;
@@ -151,6 +154,15 @@ public class TestFailureOfSharedDir {
       assertEquals(0, FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w",
           true));
 
+      Thread.sleep(conf.getLong(DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
+          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT) * 2);
+
+      NameNode nn1 = cluster.getNameNode(1);
+      assertTrue(nn1.isStandbyState());
+      assertFalse(
+          "StandBy NameNode should not go to SafeMode on resource unavailability",
+          nn1.isInSafeMode());
+
       NameNode nn0 = cluster.getNameNode(0);
       nn0.getNamesystem().getFSImage().getEditLog().getJournalSet()
           .setRuntimeForTesting(mockRuntime);