Browse Source

YARN-2019. Retrospect on decision of making RM crashed if any exception throw in ZKRMStateStore. Contributed by Jian He.
(cherry picked from commit ee98d6354bbbcd0832d3e539ee097f837e5d0e31)

Junping Du 10 năm trước cách đây
mục cha
commit
6772c3f4dd

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -89,6 +89,9 @@ Release 2.8.0 - UNRELEASED
     YARN-2003. Support for Application priority : Changes in RM and Capacity 
     Scheduler. (Sunil G via wangda)
 
+    YARN-2019. Retrospect on decision of making RM crashed if any exception throw 
+    in ZKRMStateStore. (Jian He via junping_du)
+
   IMPROVEMENTS
 
     YARN-644. Basic null check is not performed on passed in arguments before

+ 11 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

@@ -401,6 +401,11 @@ public class YarnConfiguration extends Configuration {
   public static final String RECOVERY_ENABLED = RM_PREFIX + "recovery.enabled";
   public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
 
+  public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast";
+  public static final boolean DEFAULT_YARN_FAIL_FAST = true;
+
+  public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast";
+
   @Private
   public static final String RM_WORK_PRESERVING_RECOVERY_ENABLED = RM_PREFIX
       + "work-preserving-recovery.enabled";
@@ -2018,6 +2023,12 @@ public class YarnConfiguration extends Configuration {
             YARN_HTTP_POLICY_DEFAULT));
   }
 
+  public static boolean shouldRMFailFast(Configuration conf) {
+    return conf.getBoolean(YarnConfiguration.RM_FAIL_FAST,
+        conf.getBoolean(YarnConfiguration.YARN_FAIL_FAST,
+            YarnConfiguration.DEFAULT_YARN_FAIL_FAST));
+  }
+
   @Private
   public static String getClusterId(Configuration conf) {
     String clusterId = conf.get(YarnConfiguration.RM_CLUSTER_ID);

+ 16 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

@@ -323,6 +323,22 @@
     <value>false</value>
   </property>
 
+  <property>
+    <description>Should RM fail fast if it encounters any errors. By default, it
+      points to ${yarn.fail-fast}. Errors include:
+      1) exceptions when state-store write/read operations fail.
+    </description>
+    <name>yarn.resourcemanager.fail-fast</name>
+    <value>${yarn.fail-fast}</value>
+  </property>
+
+  <property>
+    <description>Should YARN fail fast if it encounters any errors.
+    </description>
+    <name>yarn.fail-fast</name>
+    <value>true</value>
+  </property>
+
   <property>
     <description>Enable RM work preserving recovery. This configuration is private
     to YARN for experimenting the feature.

+ 7 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java

@@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
 import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.event.AsyncDispatcher;
 import org.apache.hadoop.yarn.event.Dispatcher;
 import org.apache.hadoop.yarn.event.EventHandler;
@@ -855,6 +856,7 @@ public abstract class RMStateStore extends AbstractService {
    * @param failureCause the exception due to which the operation failed
    */
   protected void notifyStoreOperationFailed(Exception failureCause) {
+    LOG.error("State store operation failed ", failureCause);
     if (failureCause instanceof StoreFencedException) {
       updateFencedState();
       Thread standByTransitionThread =
@@ -862,8 +864,11 @@ public abstract class RMStateStore extends AbstractService {
       standByTransitionThread.setName("StandByTransitionThread Handler");
       standByTransitionThread.start();
     } else {
-      rmDispatcher.getEventHandler().handle(
-        new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, failureCause));
+      if (YarnConfiguration.shouldRMFailFast(getConfig())) {
+        rmDispatcher.getEventHandler().handle(
+            new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
+                failureCause));
+      }
     }
   }