浏览代码

YARN-4414. Nodemanager connection errors are retried at multiple levels. Contributed by Chang Li
(cherry picked from commit 13de8359a1c6d9fc78cd5013c860c1086d86176f)

Conflicts:

hadoop-yarn-project/CHANGES.txt

Jason Lowe 9 年之前
父节点
当前提交
8c8b848b95

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -52,6 +52,9 @@ Release 2.6.4 - UNRELEASED
     YARN-3695. ServerProxy (NMProxy, etc.) shouldn't retry forever for non
     network exception. (Raju Bairishetti via jianhe)
 
+    YARN-4414. Nodemanager connection errors are retried at multiple levels
+    (Chang Li via jlowe)
+
 Release 2.6.3 - 2015-12-17
 
   INCOMPATIBLE CHANGES

+ 7 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/NMProxy.java

@@ -23,6 +23,7 @@ import java.net.InetSocketAddress;
 import org.apache.hadoop.classification.InterfaceAudience.Public;
 import org.apache.hadoop.classification.InterfaceStability.Unstable;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.io.retry.RetryPolicy;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
@@ -42,8 +43,12 @@ public class NMProxy extends ServerProxy {
           YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_MAX_WAIT_MS,
           YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS,
           YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_RETRY_INTERVAL_MS);
-
-    return createRetriableProxy(conf, protocol, ugi, rpc, serverAddress,
+    Configuration confClone = new Configuration(conf);
+    confClone.setInt(
+        CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 0);
+    confClone.setInt(CommonConfigurationKeysPublic.
+            IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 0);
+    return createRetriableProxy(confClone, protocol, ugi, rpc, serverAddress,
       retryPolicy);
   }
 }

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/ServerProxy.java

@@ -76,6 +76,7 @@ public class ServerProxy {
     exceptionToPolicyMap.put(ConnectException.class, retryPolicy);
     exceptionToPolicyMap.put(NoRouteToHostException.class, retryPolicy);
     exceptionToPolicyMap.put(UnknownHostException.class, retryPolicy);
+    exceptionToPolicyMap.put(ConnectTimeoutException.class, retryPolicy);
     exceptionToPolicyMap.put(RetriableException.class, retryPolicy);
     exceptionToPolicyMap.put(SocketException.class, retryPolicy);
     exceptionToPolicyMap.put(NMNotYetReadyException.class, retryPolicy);

+ 31 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestNMProxy.java

@@ -21,6 +21,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager;
 import java.io.IOException;
 import java.net.InetSocketAddress;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.io.retry.UnreliableInterface;
 import org.apache.hadoop.security.SecurityUtil;
@@ -128,7 +130,7 @@ public class TestNMProxy extends BaseContainerManagerTest {
      StartContainersRequest allRequests =
          Records.newRecord(StartContainersRequest.class);
 
-    ContainerManagementProtocol proxy = getNMProxy();
+    ContainerManagementProtocol proxy = getNMProxy(conf);
 
     retryCount = 0;
     shouldThrowNMNotYetReadyException = false;
@@ -159,14 +161,40 @@ public class TestNMProxy extends BaseContainerManagerTest {
     StartContainersRequest allRequests =
         Records.newRecord(StartContainersRequest.class);
 
-    ContainerManagementProtocol proxy = getNMProxy();
+    ContainerManagementProtocol proxy = getNMProxy(conf);
 
     shouldThrowNMNotYetReadyException = false;
     retryCount = 0;
     proxy.startContainers(allRequests);
   }
 
-  private ContainerManagementProtocol getNMProxy() {
+  @Test(timeout = 20000)
+  public void testNMProxyRPCRetry() throws Exception {
+    conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_MAX_WAIT_MS, 1000);
+    conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS, 100);
+    StartContainersRequest allRequests =
+        Records.newRecord(StartContainersRequest.class);
+    Configuration newConf = new YarnConfiguration(conf);
+    newConf.setInt(
+        CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 100);
+
+    newConf.setInt(CommonConfigurationKeysPublic.
+        IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 100);
+    // connect to some dummy address so that it can trigger
+    // connection failure and RPC level retries.
+    newConf.set(YarnConfiguration.NM_ADDRESS, "1234");
+    ContainerManagementProtocol proxy = getNMProxy(newConf);
+    try {
+      proxy.startContainers(allRequests);
+      Assert.fail("should get socket exception");
+    } catch (IOException e) {
+      // socket exception should be thrown immediately, without RPC retries.
+      Assert.assertTrue(e.toString().
+          contains("Failed on local exception: java.net.SocketException"));
+    }
+  }
+
+  private ContainerManagementProtocol getNMProxy(Configuration conf) {
     ApplicationId appId = ApplicationId.newInstance(1, 1);
     ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);