ソースを参照

HDFS-5140. Merge change r1518899 from trunk.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1518902 13f79535-47bb-0310-9956-ffa450edef68
Jing Zhao 11 年 前
コミット
9a55c1dc66

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -181,6 +181,9 @@ Release 2.1.1-beta - UNRELEASED
     HDFS-5077. NPE in FSNamesystem.commitBlockSynchronization().
     (Plamen Jeliazkov via shv)
 
+    HDFS-5140. Too many safemode monitor threads being created in the standby 
+    namenode causing it to fail with out of memory error. (jing9)
+
 Release 2.1.0-beta - 2013-08-22
 
   INCOMPATIBLE CHANGES

+ 11 - 7
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -351,7 +351,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
 
   final LeaseManager leaseManager = new LeaseManager(this); 
 
-  Daemon smmthread = null;  // SafeModeMonitor thread
+  volatile Daemon smmthread = null;  // SafeModeMonitor thread
   
   Daemon nnrmthread = null; // NamenodeResourceMonitor thread
 
@@ -4538,7 +4538,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       // Have to have write-lock since leaving safemode initializes
       // repl queues, which requires write lock
       assert hasWriteLock();
-      if (needEnter()) {
+      // if smmthread is already running, the block threshold must have been 
+      // reached before, there is no need to enter the safe mode again
+      if (smmthread == null && needEnter()) {
         enter();
         // check if we are ready to initialize replication queues
         if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
@@ -4547,7 +4549,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
         reportStatus("STATE* Safe mode ON.", false);
         return;
       }
-      // the threshold is reached
+      // the threshold is reached or was reached before
       if (!isOn() ||                           // safe mode is off
           extension <= 0 || threshold <= 0) {  // don't need to wait
         this.leave(); // leave safe mode
@@ -4559,9 +4561,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       }
       // start monitor
       reached = now();
-      smmthread = new Daemon(new SafeModeMonitor());
-      smmthread.start();
-      reportStatus("STATE* Safe mode extension entered.", true);
+      if (smmthread == null) {
+        smmthread = new Daemon(new SafeModeMonitor());
+        smmthread.start();
+        reportStatus("STATE* Safe mode extension entered.", true);
+      }
 
       // check if we are ready to initialize replication queues
       if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
@@ -4797,6 +4801,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
           if (safeMode.canLeave()) {
             // Leave safe mode.
             safeMode.leave();
+            smmthread = null;
             break;
           }
         } finally {
@@ -4812,7 +4817,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       if (!fsRunning) {
         LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
       }
-      smmthread = null;
     }
   }