Browse Source

HADOOP-10535. Make the retry numbers in ActiveStandbyElector configurable. Contributed by Jing Zhao.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1589905 13f79535-47bb-0310-9956-ffa450edef68
Jing Zhao 11 years ago
parent
commit
6d4c7df434

+ 3 - 0
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -355,6 +355,9 @@ Release 2.5.0 - UNRELEASED
 
     HADOOP-10503. Move junit up to v 4.11. (cnauroth)
 
+    HADOOP-10535. Make the retry numbers in ActiveStandbyElector configurable.
+    (jing9)
+
   OPTIMIZATIONS
 
   BUG FIXES 

+ 5 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java

@@ -199,6 +199,11 @@ public class CommonConfigurationKeys extends CommonConfigurationKeysPublic {
       "ha.failover-controller.graceful-fence.connection.retries";
   public static final int HA_FC_GRACEFUL_FENCE_CONNECTION_RETRIES_DEFAULT = 1;
 
+  /** Maximum number of ZooKeeper operation retries in ActiveStandbyElector. */
+  public static final String HA_FC_ELECTOR_ZK_OP_RETRIES_KEY =
+      "ha.failover-controller.active-standby-elector.zk.op.retries";
+  public static final int HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT = 3;
+
   /* Timeout that the CLI (manual) FC waits for monitorHealth, getServiceState */
   public static final String HA_FC_CLI_CHECK_TIMEOUT_KEY =
     "ha.failover-controller.cli-check.rpc-timeout.ms";

+ 10 - 9
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java

@@ -143,7 +143,6 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
 
   public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);
 
-  static int NUM_RETRIES = 3;
   private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000;
 
   private static enum ConnectionState {
@@ -170,6 +169,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
   private final String zkLockFilePath;
   private final String zkBreadCrumbPath;
   private final String znodeWorkingDir;
+  private final int maxRetryNum;
 
   private Lock sessionReestablishLockForTests = new ReentrantLock();
   private boolean wantToBeInElection;
@@ -207,7 +207,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
   public ActiveStandbyElector(String zookeeperHostPorts,
       int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
       List<ZKAuthInfo> authInfo,
-      ActiveStandbyElectorCallback app) throws IOException,
+      ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
       HadoopIllegalArgumentException, KeeperException {
     if (app == null || acl == null || parentZnodeName == null
         || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
@@ -220,7 +220,8 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
     appClient = app;
     znodeWorkingDir = parentZnodeName;
     zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME;
-    zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;    
+    zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
+    this.maxRetryNum = maxRetryNum;
 
     // createConnection for future API calls
     createConnection();
@@ -439,7 +440,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
     LOG.debug(errorMessage);
 
     if (shouldRetry(code)) {
-      if (createRetryCount < NUM_RETRIES) {
+      if (createRetryCount < maxRetryNum) {
         LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
         ++createRetryCount;
         createLockNodeAsync();
@@ -500,7 +501,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
     LOG.debug(errorMessage);
 
     if (shouldRetry(code)) {
-      if (statRetryCount < NUM_RETRIES) {
+      if (statRetryCount < maxRetryNum) {
         ++statRetryCount;
         monitorLockNodeAsync();
         return;
@@ -735,7 +736,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
   private boolean reEstablishSession() {
     int connectionRetryCount = 0;
     boolean success = false;
-    while(!success && connectionRetryCount < NUM_RETRIES) {
+    while(!success && connectionRetryCount < maxRetryNum) {
       LOG.debug("Establishing zookeeper connection for " + this);
       try {
         createConnection();
@@ -972,14 +973,14 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
     });
   }
 
-  private static <T> T zkDoWithRetries(ZKAction<T> action)
-      throws KeeperException, InterruptedException {
+  private <T> T zkDoWithRetries(ZKAction<T> action) throws KeeperException,
+      InterruptedException {
     int retry = 0;
     while (true) {
       try {
         return action.run();
       } catch (KeeperException ke) {
-        if (shouldRetry(ke.code()) && ++retry < NUM_RETRIES) {
+        if (shouldRetry(ke.code()) && ++retry < maxRetryNum) {
           continue;
         }
         throw ke;

+ 5 - 2
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java

@@ -32,6 +32,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.HadoopIllegalArgumentException;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.ha.ActiveStandbyElector.ActiveNotFoundException;
 import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
@@ -341,10 +342,12 @@ public abstract class ZKFailoverController {
     Preconditions.checkArgument(zkTimeout > 0,
         "Invalid ZK session timeout %s", zkTimeout);
     
-
+    int maxRetryNum = conf.getInt(
+        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
+        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
     elector = new ActiveStandbyElector(zkQuorum,
         zkTimeout, getParentZnode(), zkAcls, zkAuths,
-        new ElectorCallbacks());
+        new ElectorCallbacks(), maxRetryNum);
   }
   
   private String getParentZnode() {

+ 6 - 3
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java

@@ -39,6 +39,7 @@ import org.junit.Assert;
 import org.mockito.Mockito;
 
 import org.apache.hadoop.HadoopIllegalArgumentException;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
 import org.apache.hadoop.ha.ActiveStandbyElector.ActiveNotFoundException;
 import org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
@@ -59,8 +60,9 @@ public class TestActiveStandbyElector {
     ActiveStandbyElectorTester(String hostPort, int timeout, String parent,
         List<ACL> acl, ActiveStandbyElectorCallback app) throws IOException,
         KeeperException {
-      super(hostPort, timeout, parent, acl,
-          Collections.<ZKAuthInfo>emptyList(), app);
+      super(hostPort, timeout, parent, acl, Collections
+          .<ZKAuthInfo> emptyList(), app,
+          CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
     }
 
     @Override
@@ -715,7 +717,8 @@ public class TestActiveStandbyElector {
   public void testWithoutZKServer() throws Exception {
     try {
       new ActiveStandbyElector("127.0.0.1", 2000, ZK_PARENT_NAME,
-          Ids.OPEN_ACL_UNSAFE, Collections.<ZKAuthInfo> emptyList(), mockApp);
+          Ids.OPEN_ACL_UNSAFE, Collections.<ZKAuthInfo> emptyList(), mockApp,
+          CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
       Assert.fail("Did not throw zookeeper connection loss exceptions!");
     } catch (KeeperException ke) {
       GenericTestUtils.assertExceptionContains( "ConnectionLoss", ke);

+ 4 - 3
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java

@@ -26,6 +26,7 @@ import java.util.Collections;
 import java.util.UUID;
 
 import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
 import org.apache.hadoop.ha.ActiveStandbyElector.State;
 import org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
@@ -70,9 +71,9 @@ public class TestActiveStandbyElectorRealZK extends ClientBaseWithFixes {
     for (int i = 0; i < NUM_ELECTORS; i++) {
       cbs[i] =  Mockito.mock(ActiveStandbyElectorCallback.class);
       appDatas[i] = Ints.toByteArray(i);
-      electors[i] = new ActiveStandbyElector(
-          hostPort, 5000, PARENT_DIR, Ids.OPEN_ACL_UNSAFE,
-          Collections.<ZKAuthInfo>emptyList(), cbs[i]);
+      electors[i] = new ActiveStandbyElector(hostPort, 5000, PARENT_DIR,
+          Ids.OPEN_ACL_UNSAFE, Collections.<ZKAuthInfo> emptyList(), cbs[i],
+          CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
     }
   }
   

+ 2 - 2
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverControllerStress.java

@@ -23,6 +23,7 @@ import java.util.Random;
 
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.util.Time;
 import org.junit.After;
@@ -126,8 +127,7 @@ public class TestZKFailoverControllerStress extends ClientBaseWithFixes {
         .when(cluster.getService(0).proxy).monitorHealth();
     Mockito.doAnswer(new RandomlyThrow(1))
         .when(cluster.getService(1).proxy).monitorHealth();
-    ActiveStandbyElector.NUM_RETRIES = 100;
-    
+    conf.setInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, 100);
     // Don't start until after the above mocking. Otherwise we can get
     // Mockito errors if the HM calls the proxy in the middle of
     // setting up the mock.

+ 5 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java

@@ -23,6 +23,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.ha.ActiveStandbyElector;
 import org.apache.hadoop.ha.HAServiceProtocol;
 import org.apache.hadoop.ha.ServiceFailedException;
@@ -85,8 +86,11 @@ public class EmbeddedElectorService extends AbstractService
     List<ACL> zkAcls = RMZKUtils.getZKAcls(conf);
     List<ZKUtil.ZKAuthInfo> zkAuths = RMZKUtils.getZKAuths(conf);
 
+    int maxRetryNum = conf.getInt(
+        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
+        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
     elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout,
-        electionZNode, zkAcls, zkAuths, this);
+        electionZNode, zkAcls, zkAuths, this, maxRetryNum);
 
     elector.ensureParentZNode();
     if (!isParentZnodeSafe(clusterId)) {