Sfoglia il codice sorgente

HADOOP-581. Fix datanode to not reset itself on communications errors with namenode. Contributed by Owen.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@453776 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 18 anni fa
parent
commit
a7ce55edc2
2 file modificati con 28 aggiunte e 16 eliminazioni
  1. 6 0
      CHANGES.txt
  2. 22 16
      src/java/org/apache/hadoop/dfs/DataNode.java

+ 6 - 0
CHANGES.txt

@@ -150,6 +150,12 @@ Trunk (unreleased changes)
     This should improve some DFS namenode failure modes.
     (omalley via cutting)
 
+36. HADOOP-581.  Fix datanode to not reset itself on communications
+    errors with the namenode.  If a request to the namenode fails, the
+    datanode should retry, not restart.  This reduces the load on the
+    namenode, since restarts cause a resend of the block report.
+    (omalley via cutting)
+
 
 Release 0.6.2 - 2006-09-18
 

+ 22 - 16
src/java/org/apache/hadoop/dfs/DataNode.java

@@ -323,8 +323,8 @@ public class DataNode implements FSConstants, Runnable {
       // Now loop for a long time....
       //
 
-      try {
-        while (shouldRun) {
+      while (shouldRun) {
+        try {
           long now = System.currentTimeMillis();
 
           //
@@ -411,11 +411,15 @@ public class DataNode implements FSConstants, Runnable {
               // Send newly-received blockids to namenode
               //
               blockArray = (Block[]) receivedBlockList.toArray(new Block[receivedBlockList.size()]);
-              receivedBlockList.removeAllElements();
             }
           }
           if( blockArray != null ) {
             namenode.blockReceived( dnRegistration, blockArray );
+            synchronized (receivedBlockList) {
+              for(Block b: blockArray) {
+                receivedBlockList.remove(b);
+              }
+            }
           }
             
           //
@@ -431,19 +435,22 @@ public class DataNode implements FSConstants, Runnable {
               }
             }
           } // synchronized
-        } // while (shouldRun)
-      } catch(DiskErrorException e) {
-        handleDiskError(e.getLocalizedMessage());
-      } catch( RemoteException re ) {
-        String reClass = re.getClassName();
-        if( UnregisteredDatanodeException.class.getName().equals( reClass )) {
-          LOG.warn( "DataNode is shutting down: " + 
-                    StringUtils.stringifyException(re));
-          shutdown();
+        } catch(DiskErrorException e) {
+          handleDiskError(e.getLocalizedMessage());
           return;
+        } catch( RemoteException re ) {
+          String reClass = re.getClassName();
+          if( UnregisteredDatanodeException.class.getName().equals( reClass )) {
+            LOG.warn( "DataNode is shutting down: " + 
+                      StringUtils.stringifyException(re));
+            shutdown();
+            return;
+          }
+          LOG.warn(StringUtils.stringifyException(re));
+        } catch (IOException e) {
+          LOG.warn(StringUtils.stringifyException(e));
         }
-        throw re;
-      }
+      } // while (shouldRun)
     } // offerService
 
     
@@ -968,9 +975,8 @@ public class DataNode implements FSConstants, Runnable {
             try {
                 offerService();
             } catch (Exception ex) {
-                LOG.info("Exception: " + ex);
+              LOG.error("Exception: " + StringUtils.stringifyException(ex));
               if (shouldRun) {
-                LOG.info("Lost connection to namenode.  Retrying...");
                 try {
                   Thread.sleep(5000);
                 } catch (InterruptedException ie) {