Przeglądaj źródła

HADOOP-4278. If the primary datanode fails in DFSClient, remove it from the pipeline. (dhruba via szetszwo)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/trunk@705691 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze 16 lat temu
rodzic
commit
0713135291
2 zmienionych plików z 24 dodań i 4 usunięć
  1. 3 0
      CHANGES.txt
  2. 21 4
      src/hdfs/org/apache/hadoop/hdfs/DFSClient.java

+ 3 - 0
CHANGES.txt

@@ -953,6 +953,9 @@ Release 0.19.0 - Unreleased
     HADOOP-4427. Adds the new queue/job commands to the manual.
     (Sreekanth Ramakrishnan via ddas)
 
+    HADOOP-4278. If the primary datanode fails in DFSClient, remove it from
+    the pipeline.  (dhruba via szetszwo)
+
 Release 0.18.2 - Unreleased
 
   BUG FIXES

+ 21 - 4
src/hdfs/org/apache/hadoop/hdfs/DFSClient.java

@@ -2448,17 +2448,34 @@ public class DFSClient implements FSConstants, java.io.Closeable {
         //
         LocatedBlock newBlock = null;
         ClientDatanodeProtocol primary =  null;
+        DatanodeInfo primaryNode = null;
         try {
           // Pick the "least" datanode as the primary datanode to avoid deadlock.
-          primary = createClientDatanodeProtocolProxy(
-              Collections.min(Arrays.asList(newnodes)), conf);
+          primaryNode = Collections.min(Arrays.asList(newnodes));
+          primary = createClientDatanodeProtocolProxy(primaryNode, conf);
           newBlock = primary.recoverBlock(block, newnodes);
         } catch (IOException e) {
           recoveryErrorCount++;
           if (recoveryErrorCount > maxRecoveryErrorCount) {
+            if (nodes.length > 1) {
+              // if the primary datanode failed, remove it from the list.
+              // The original bad datanode is left in the list because it is
+              // conservative to remove only one datanode in one iteration.
+              for (int j = 0; j < nodes.length; j++) {
+                if (nodes[j] ==  primaryNode) {
+                  errorIndex = j; // forget original bad node.
+                }
+              }
+              LOG.warn("Error Recovery for block " + block + " failed " +
+                       " because recovery from primary datanode " +
+                       primaryNode + " failed " + recoveryErrorCount +
+                       " times. Marking primary datanode as bad.");
+              recoveryErrorCount = 0; 
+              return true;          // sleep when we return from here
+            }
             String emsg = "Error Recovery for block " + block + " failed " +
                           " because recovery from primary datanode " +
-                          newnodes[0] + " failed " + recoveryErrorCount + 
+                          primaryNode + " failed " + recoveryErrorCount + 
                           " times. Aborting...";
             LOG.warn(emsg);
             lastException = new IOException(emsg);
@@ -2468,7 +2485,7 @@ public class DFSClient implements FSConstants, java.io.Closeable {
           } 
           LOG.warn("Error Recovery for block " + block + " failed " +
                    " because recovery from primary datanode " +
-                   newnodes[0] + " failed " + recoveryErrorCount +
+                   primaryNode + " failed " + recoveryErrorCount +
                    " times. Will retry...");
           return true;          // sleep when we return from here
         } finally {