Bläddra i källkod

HADOOP-1312. Fix a ConcurrentModificationException in NameNode that killed the heartbeat monitoring thread. Contributed by Dhruba.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@534624 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 18 år sedan
förälder
incheckning
4811a3248d
2 ändrade filer med 37 tillägg och 23 borttagningar
  1. CHANGES.txt (+4 -0)
  2. src/java/org/apache/hadoop/dfs/FSNamesystem.java (+33 -23)

+ 4 - 0
CHANGES.txt

@@ -312,6 +312,10 @@ Trunk (unreleased changes)
 92. HADOOP-1308.  Use generics to restrict types when classes are
     passed as parameters to JobConf methods. (Michael Bieniosek via cutting)
 
+93. HADOOP-1312.  Fix a ConcurrentModificationException in NameNode
+    that killed the heartbeat monitoring thread.
+    (Dhruba Borthakur via cutting)
+
 
 Release 0.12.3 - 2007-04-06
 

+ 33 - 23
src/java/org/apache/hadoop/dfs/FSNamesystem.java

@@ -1253,29 +1253,33 @@ class FSNamesystem implements FSConstants {
    ******************************************************/
   class LeaseMonitor implements Runnable {
     public void run() {
-      while (fsRunning) {
-        synchronized (FSNamesystem.this) {
-          synchronized (leases) {
-            Lease top;
-            while ((sortedLeases.size() > 0) &&
-                   ((top = sortedLeases.first()) != null)) {
-              if (top.expiredHardLimit()) {
-                top.releaseLocks();
-                leases.remove(top.holder);
-                LOG.info("Removing lease " + top + ", leases remaining: " + sortedLeases.size());
-                if (!sortedLeases.remove(top)) {
-                  LOG.info("Unknown failure trying to remove " + top + " from lease set.");
+      try {
+        while (fsRunning) {
+          synchronized (FSNamesystem.this) {
+            synchronized (leases) {
+              Lease top;
+              while ((sortedLeases.size() > 0) &&
+                     ((top = sortedLeases.first()) != null)) {
+                if (top.expiredHardLimit()) {
+                  top.releaseLocks();
+                  leases.remove(top.holder);
+                  LOG.info("Removing lease " + top + ", leases remaining: " + sortedLeases.size());
+                  if (!sortedLeases.remove(top)) {
+                    LOG.info("Unknown failure trying to remove " + top + " from lease set.");
+                  }
+                } else {
+                  break;
                 }
-              } else {
-                break;
               }
             }
           }
+          try {
+            Thread.sleep(2000);
+          } catch (InterruptedException ie) {
+          }
         }
-        try {
-          Thread.sleep(2000);
-        } catch (InterruptedException ie) {
-        }
+      } catch (Exception e) {
+        FSNamesystem.LOG.error(StringUtils.stringifyException(e));
       }
     }
   }
@@ -1636,7 +1640,11 @@ class FSNamesystem implements FSConstants {
      */
     public void run() {
       while (fsRunning) {
-        heartbeatCheck();
+        try {
+          heartbeatCheck();
+        } catch (Exception e) {
+          FSNamesystem.LOG.error(StringUtils.stringifyException(e));
+        }
         try {
           Thread.sleep(heartbeatRecheckInterval);
         } catch (InterruptedException ie) {
@@ -1809,10 +1817,12 @@ class FSNamesystem implements FSConstants {
    * @author hairong
    */
   private void removeDatanode(DatanodeDescriptor nodeInfo) {
-    if (nodeInfo.isAlive) {
-      updateStats(nodeInfo, false);
-      heartbeats.remove(nodeInfo);
-      nodeInfo.isAlive = false;
+    synchronized (heartbeats) {
+      if (nodeInfo.isAlive) {
+        updateStats(nodeInfo, false);
+        heartbeats.remove(nodeInfo);
+        nodeInfo.isAlive = false;
+      }
     }
 
     for (Iterator<Block> it = nodeInfo.getBlockIterator(); it.hasNext();) {