Browse Source

YARN-3753. RM failed to come up with "java.io.IOException: Wait for ZKClient creation timed out". Contributed by Jian He

Xuan 10 years ago
parent
commit
b34825b0cb

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -142,6 +142,9 @@ Release 2.7.1 - UNRELEASED
     YARN-3725. App submission via REST API is broken in secure mode due to 
     YARN-3725. App submission via REST API is broken in secure mode due to 
     Timeline DT service address is empty. (Zhijie Shen via wangda)
     Timeline DT service address is empty. (Zhijie Shen via wangda)
 
 
+    YARN-3753. RM failed to come up with "java.io.IOException: Wait for
+    ZKClient creation timed out". (Jian He via xgong)
+
 Release 2.7.0 - 2015-04-20
 Release 2.7.0 - 2015-04-20
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 5 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java

@@ -104,6 +104,8 @@ public class ZKRMStateStore extends RMStateStore {
 
 
   private String zkHostPort = null;
   private String zkHostPort = null;
   private int zkSessionTimeout;
   private int zkSessionTimeout;
+  // wait time for zkClient to re-establish connection with zk-server.
+  private long zkResyncWaitTime;
 
 
   @VisibleForTesting
   @VisibleForTesting
   long zkRetryInterval;
   long zkRetryInterval;
@@ -234,6 +236,7 @@ public class ZKRMStateStore extends RMStateStore {
           conf.getLong(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS,
           conf.getLong(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS,
               YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS);
               YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS);
     }
     }
+    zkResyncWaitTime = zkRetryInterval * numRetries;
 
 
     zkAcl = RMZKUtils.getZKAcls(conf);
     zkAcl = RMZKUtils.getZKAcls(conf);
     zkAuths = RMZKUtils.getZKAuths(conf);
     zkAuths = RMZKUtils.getZKAuths(conf);
@@ -1081,11 +1084,11 @@ public class ZKRMStateStore extends RMStateStore {
       long startTime = System.currentTimeMillis();
       long startTime = System.currentTimeMillis();
       synchronized (ZKRMStateStore.this) {
       synchronized (ZKRMStateStore.this) {
         while (zkClient == null) {
         while (zkClient == null) {
-          ZKRMStateStore.this.wait(zkSessionTimeout);
+          ZKRMStateStore.this.wait(zkResyncWaitTime);
           if (zkClient != null) {
           if (zkClient != null) {
             break;
             break;
           }
           }
-          if (System.currentTimeMillis() - startTime > zkSessionTimeout) {
+          if (System.currentTimeMillis() - startTime > zkResyncWaitTime) {
             throw new IOException("Wait for ZKClient creation timed out");
             throw new IOException("Wait for ZKClient creation timed out");
           }
           }
         }
         }

+ 17 - 9
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java

@@ -170,10 +170,10 @@ public class TestZKRMStateStoreZKClientConnections extends
       throws Exception {
       throws Exception {
 
 
     TestZKClient zkClientTester = new TestZKClient();
     TestZKClient zkClientTester = new TestZKClient();
-    String path = "/test";
+    final String path = "/test";
     YarnConfiguration conf = new YarnConfiguration();
     YarnConfiguration conf = new YarnConfiguration();
     conf.setInt(YarnConfiguration.RM_ZK_TIMEOUT_MS, ZK_TIMEOUT_MS);
     conf.setInt(YarnConfiguration.RM_ZK_TIMEOUT_MS, ZK_TIMEOUT_MS);
-    ZKRMStateStore store =
+    final ZKRMStateStore store =
         (ZKRMStateStore) zkClientTester.getRMStateStore(conf);
         (ZKRMStateStore) zkClientTester.getRMStateStore(conf);
     TestDispatcher dispatcher = new TestDispatcher();
     TestDispatcher dispatcher = new TestDispatcher();
     store.setRMDispatcher(dispatcher);
     store.setRMDispatcher(dispatcher);
@@ -185,14 +185,20 @@ public class TestZKRMStateStoreZKClientConnections extends
     store.setDataWithRetries(path, "newBytes".getBytes(), 0);
     store.setDataWithRetries(path, "newBytes".getBytes(), 0);
 
 
     stopServer();
     stopServer();
+    final AtomicBoolean isSucceeded = new AtomicBoolean(false);
     zkClientTester.watcher.waitForDisconnected(ZK_OP_WAIT_TIME);
     zkClientTester.watcher.waitForDisconnected(ZK_OP_WAIT_TIME);
-    try {
-      store.getDataWithRetries(path, true);
-      fail("Expected ZKClient time out exception");
-    } catch (Exception e) {
-      assertTrue(e.getMessage().contains(
-          "Wait for ZKClient creation timed out"));
-    }
+    Thread thread = new Thread() {
+      @Override
+      public void run() {
+        try {
+          store.getDataWithRetries(path, true);
+          isSucceeded.set(true);
+        } catch (Exception e) {
+          isSucceeded.set(false);
+        }
+      }
+    };
+    thread.start();
 
 
     // ZKRMStateStore Session restored
     // ZKRMStateStore Session restored
     startServer();
     startServer();
@@ -206,6 +212,8 @@ public class TestZKRMStateStoreZKClientConnections extends
       fail(error);
       fail(error);
     }
     }
     assertEquals("newBytes", new String(ret));
     assertEquals("newBytes", new String(ret));
+    thread.join();
+    assertTrue(isSucceeded.get());
   }
   }
 
 
   @Test(timeout = 20000)
   @Test(timeout = 20000)