瀏覽代碼

YARN-4243. Add retry on establishing Zookeeper connection in EmbeddedElectorService#serviceInit. Contributed by Xuan Gong.
(cherry picked from commit 0fce5f9a496925f0d53ea6c14318c9b513de9882)

Junping Du 9 年之前
父節點
當前提交
d5145b58f6

+ 49 - 4
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java

@@ -208,8 +208,49 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
    */
   public ActiveStandbyElector(String zookeeperHostPorts,
       int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
-      List<ZKAuthInfo> authInfo,
-      ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
+      List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
+      int maxRetryNum) throws IOException, HadoopIllegalArgumentException,
+      KeeperException {
+    this(zookeeperHostPorts, zookeeperSessionTimeout, parentZnodeName, acl,
+      authInfo, app, maxRetryNum, true);
+  }
+
+  /**
+   * Create a new ActiveStandbyElector object <br/>
+   * The elector is created by providing to it the Zookeeper configuration, the
+   * parent znode under which to create the znode and a reference to the
+   * callback interface. <br/>
+   * The parent znode name must be the same for all service instances and
+   * different across services. <br/>
+   * After the leader has been lost, a new leader will be elected after the
+   * session timeout expires. Hence, the app must set this parameter based on
+   * its needs for failure response time. The session timeout must be greater
+   * than the Zookeeper disconnect timeout and is recommended to be 3X that
+   * value to enable Zookeeper to retry transient disconnections. Setting a very
+   * short session timeout may result in frequent transitions between active and
+   * standby states during issues like network outages/GC pauses.
+   * 
+   * @param zookeeperHostPorts
+   *          ZooKeeper hostPort for all ZooKeeper servers
+   * @param zookeeperSessionTimeout
+   *          ZooKeeper session timeout
+   * @param parentZnodeName
+   *          znode under which to create the lock
+   * @param acl
+   *          ZooKeeper ACL's
+   * @param authInfo a list of authentication credentials to add to the
+   *                 ZK connection
+   * @param app
+   *          reference to callback interface object
+   * @param failFast
+   *          true: connect once, no retry; false: retry the ZK connection.
+   * @throws IOException
+   * @throws HadoopIllegalArgumentException
+   */
+  public ActiveStandbyElector(String zookeeperHostPorts,
+      int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
+      List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
+      int maxRetryNum, boolean failFast) throws IOException,
       HadoopIllegalArgumentException, KeeperException {
     if (app == null || acl == null || parentZnodeName == null
         || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
@@ -225,8 +266,12 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
     zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
     this.maxRetryNum = maxRetryNum;
 
-    // createConnection for future API calls
-    createConnection();
+    // establish the ZK Connection for future API calls
+    if (failFast) {
+      createConnection();
+    } else {
+      reEstablishSession();
+    }
   }
 
   /**

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -476,6 +476,9 @@ Release 2.8.0 - UNRELEASED
     YARN-3985. Make ReservationSystem persist state using RMStateStore
     reservation APIs. (adhoot via asuresh)
 
+    YARN-4243. Add retry on establishing Zookeeper connection in
+    EmbeddedElectorService#serviceInit. (Xuan Gong via junping_du)
+
   OPTIMIZATIONS
 
     YARN-3339. TestDockerContainerExecutor should pull a single image and not

+ 4 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

@@ -531,6 +531,10 @@ public class YarnConfiguration extends Configuration {
   public static final int
       DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0;
 
+  /** Number of ZooKeeper operation retries in ActiveStandbyElector. */
+  public static final String RM_HA_FC_ELECTOR_ZK_RETRIES_KEY = RM_HA_PREFIX
+      + "failover-controller.active-standby-elector.zk.retries";
+
   ////////////////////////////////
   // RM state store configs
   ////////////////////////////////

+ 7 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

@@ -388,6 +388,13 @@
     <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value>
   </property>
 
+  <property>
+    <description>When automatic failover is enabled, the number of ZooKeeper
+      operation retries in ActiveStandbyElector.</description>
+    <name>yarn.resourcemanager.ha.failover-controller.active-standby-elector.zk.retries</name>
+    <!--<value>3</value>-->
+  </property>
+
   <property>
     <description>The maximum number of completed applications RM state
     store keeps, less than or equals to ${yarn.resourcemanager.max-completed-applications}.

+ 5 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java

@@ -86,11 +86,12 @@ public class EmbeddedElectorService extends AbstractService
     List<ACL> zkAcls = RMZKUtils.getZKAcls(conf);
     List<ZKUtil.ZKAuthInfo> zkAuths = RMZKUtils.getZKAuths(conf);
 
-    int maxRetryNum = conf.getInt(
-        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
-        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
+    int maxRetryNum =
+        conf.getInt(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_RETRIES_KEY, conf
+          .getInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
+            CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT));
     elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout,
-        electionZNode, zkAcls, zkAuths, this, maxRetryNum);
+        electionZNode, zkAcls, zkAuths, this, maxRetryNum, false);
 
     elector.ensureParentZNode();
     if (!isParentZnodeSafe(clusterId)) {