Browse Source

YARN-542. Changed the default global AM max-attempts value to be greater than one. Contributed by Zhijie Shen.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1470094 13f79535-47bb-0310-9956-ffa450edef68
Vinod Kumar Vavilapalli 12 years ago
parent
commit
a91067fc5e

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -164,6 +164,9 @@ Release 2.0.5-beta - UNRELEASED
     YARN-586. Fixed a typo in ApplicationSubmissionContext#setApplicationId.
     YARN-586. Fixed a typo in ApplicationSubmissionContext#setApplicationId.
     (Zhijie Shen via vinodkv)
     (Zhijie Shen via vinodkv)
 
 
+    YARN-542. Changed the default global AM max-attempts value to be greater
+    than one. (Zhijie Shen via vinodkv)
+
   OPTIMIZATIONS
   OPTIMIZATIONS
 
 
   BUG FIXES
   BUG FIXES

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

@@ -186,7 +186,7 @@ public class YarnConfiguration extends Configuration {
    */
    */
   public static final String RM_AM_MAX_ATTEMPTS =
   public static final String RM_AM_MAX_ATTEMPTS =
     RM_PREFIX + "am.max-attempts";
     RM_PREFIX + "am.max-attempts";
-  public static final int DEFAULT_RM_AM_MAX_ATTEMPTS = 1;
+  public static final int DEFAULT_RM_AM_MAX_ATTEMPTS = 2;
   
   
   /** The keytab for the resource manager.*/
   /** The keytab for the resource manager.*/
   public static final String RM_KEYTAB = 
   public static final String RM_KEYTAB = 

+ 3 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

@@ -145,9 +145,10 @@
     setting for all application masters. Each application master can specify
     setting for all application masters. Each application master can specify
     its individual maximum number of application attempts via the API, but the
     its individual maximum number of application attempts via the API, but the
     individual number cannot be more than the global upper bound. If it is,
     individual number cannot be more than the global upper bound. If it is,
-    the resourcemanager will override it.</description>
+    the resourcemanager will override it. The default value is 2, which
+    allows at least one retry of the AM.</description>
     <name>yarn.resourcemanager.am.max-attempts</name>
     <name>yarn.resourcemanager.am.max-attempts</name>
-    <value>1</value>
+    <value>2</value>
   </property>
   </property>
 
 
   <property>
   <property>

+ 6 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java

@@ -64,7 +64,9 @@ public class TestRMRestart {
     "org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore");
     "org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore");
     conf.set(YarnConfiguration.RM_SCHEDULER, 
     conf.set(YarnConfiguration.RM_SCHEDULER, 
     "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler");
     "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler");
-    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 5);
+    Assert.assertTrue(YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS > 1);
+    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
+        YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
 
 
     MemoryRMStateStore memStore = new MemoryRMStateStore();
     MemoryRMStateStore memStore = new MemoryRMStateStore();
     memStore.init(conf);
     memStore.init(conf);
@@ -321,7 +323,9 @@ public class TestRMRestart {
     conf.set(YarnConfiguration.RECOVERY_ENABLED, "true");
     conf.set(YarnConfiguration.RECOVERY_ENABLED, "true");
     conf.set(YarnConfiguration.RM_STORE, 
     conf.set(YarnConfiguration.RM_STORE, 
     "org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore");
     "org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore");
-    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
+    Assert.assertTrue(YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS > 1);
+    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
+        YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
 
 
     MemoryRMStateStore memStore = new MemoryRMStateStore();
     MemoryRMStateStore memStore = new MemoryRMStateStore();
     memStore.init(conf);
     memStore.init(conf);

+ 4 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java

@@ -62,7 +62,8 @@ public class TestRMAppTransitions {
   static final Log LOG = LogFactory.getLog(TestRMAppTransitions.class);
   static final Log LOG = LogFactory.getLog(TestRMAppTransitions.class);
 
 
   private RMContext rmContext;
   private RMContext rmContext;
-  private static int maxAppAttempts = 4;
+  private static int maxAppAttempts =
+      YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS;
   private static int appId = 1;
   private static int appId = 1;
   private DrainDispatcher rmDispatcher;
   private DrainDispatcher rmDispatcher;
 
 
@@ -499,6 +500,7 @@ public class TestRMAppTransitions {
 
 
     RMApp application = testCreateAppAccepted(null);
     RMApp application = testCreateAppAccepted(null);
    // ACCEPTED => ACCEPTED event RMAppEventType.ATTEMPT_FAILED
    // ACCEPTED => ACCEPTED event RMAppEventType.ATTEMPT_FAILED
+    Assert.assertTrue(maxAppAttempts > 1);
     for (int i=1; i < maxAppAttempts; i++) {
     for (int i=1; i < maxAppAttempts; i++) {
       RMAppEvent event = 
       RMAppEvent event = 
           new RMAppFailedAttemptEvent(application.getApplicationId(), 
           new RMAppFailedAttemptEvent(application.getApplicationId(), 
@@ -562,6 +564,7 @@ public class TestRMAppTransitions {
     Assert.assertEquals(expectedAttemptId, 
     Assert.assertEquals(expectedAttemptId, 
         appAttempt.getAppAttemptId().getAttemptId());
         appAttempt.getAppAttemptId().getAttemptId());
     // RUNNING => FAILED/RESTARTING event RMAppEventType.ATTEMPT_FAILED
     // RUNNING => FAILED/RESTARTING event RMAppEventType.ATTEMPT_FAILED
+    Assert.assertTrue(maxAppAttempts > 1);
     for (int i=1; i<maxAppAttempts; i++) {
     for (int i=1; i<maxAppAttempts; i++) {
       RMAppEvent event = 
       RMAppEvent event = 
           new RMAppFailedAttemptEvent(application.getApplicationId(), 
           new RMAppFailedAttemptEvent(application.getApplicationId(), 

+ 5 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesApps.java

@@ -83,7 +83,8 @@ public class TestRMWebServicesApps extends JerseyTest {
       bind(RMWebServices.class);
       bind(RMWebServices.class);
       bind(GenericExceptionHandler.class);
       bind(GenericExceptionHandler.class);
       Configuration conf = new Configuration();
       Configuration conf = new Configuration();
-      conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
+      conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
+          YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
       conf.setClass(YarnConfiguration.RM_SCHEDULER, FifoScheduler.class,
       conf.setClass(YarnConfiguration.RM_SCHEDULER, FifoScheduler.class,
           ResourceScheduler.class);
           ResourceScheduler.class);
       rm = new MockRM(conf);
       rm = new MockRM(conf);
@@ -871,8 +872,10 @@ public class TestRMWebServicesApps extends JerseyTest {
     MockNM amNodeManager = rm.registerNode("amNM:1234", 2048);
     MockNM amNodeManager = rm.registerNode("amNM:1234", 2048);
     RMApp app1 = rm.submitApp(1024, "testwordcount", "user1");
     RMApp app1 = rm.submitApp(1024, "testwordcount", "user1");
     amNodeManager.nodeHeartbeat(true);
     amNodeManager.nodeHeartbeat(true);
-    int maxAppAttempts = rm.getConfig().getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
+    int maxAppAttempts = rm.getConfig().getInt(
+        YarnConfiguration.RM_AM_MAX_ATTEMPTS,
         YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
         YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
+    assertTrue(maxAppAttempts > 1);
     int retriesLeft = maxAppAttempts;
     int retriesLeft = maxAppAttempts;
     while (--retriesLeft > 0) {
     while (--retriesLeft > 0) {
       RMAppEvent event =
       RMAppEvent event =