Parcourir la source

YARN-10119. Option to reset AM failure count for YARN Service

Contributed by kyungwan nam.
Prabhu Joseph il y a 5 ans
Parent
commit
181e6d033e

+ 4 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/client/ServiceClient.java

@@ -1000,6 +1000,10 @@ public class ServiceClient extends AppAdminClient implements SliderExitCodes,
     submissionContext.setMaxAppAttempts(YarnServiceConf
         .getInt(YarnServiceConf.AM_RESTART_MAX, DEFAULT_AM_RESTART_MAX, app
             .getConfiguration(), conf));
+    submissionContext.setAttemptFailuresValidityInterval(YarnServiceConf
+        .getLong(YarnServiceConf.AM_FAILURES_VALIDITY_INTERVAL,
+            DEFAULT_AM_FAILURES_VALIDITY_INTERVAL, app.getConfiguration(),
+            conf));
 
     setLogAggregationContext(app, conf, submissionContext);
 

+ 3 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/conf/YarnServiceConf.java

@@ -44,6 +44,9 @@ public class YarnServiceConf {
 
   public static final String AM_RESTART_MAX = "yarn.service.am-restart.max-attempts";
   public static final int DEFAULT_AM_RESTART_MAX = 20;
+  public static final String AM_FAILURES_VALIDITY_INTERVAL =
+      "yarn.service.am-failure.validity-interval-ms";
+  public static final long DEFAULT_AM_FAILURES_VALIDITY_INTERVAL = -1;
   public static final String AM_RESOURCE_MEM = "yarn.service.am-resource.memory";
   public static final long DEFAULT_KEY_AM_RESOURCE_MEM = 1024;
 

+ 43 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/TestYarnNativeServices.java

@@ -938,4 +938,47 @@ public class TestYarnNativeServices extends ServiceTestUtils {
     Assert.assertEquals("Restarted service state should be STABLE",
         ServiceState.STABLE, service.getState());
   }
+
+  @Test(timeout = 200000)
+  public void testAMFailureValidity() throws Exception {
+    setupInternal(NUM_NMS);
+    ServiceClient client = createClient(getConf());
+    Service exampleApp = new Service();
+    exampleApp.setName("example-app");
+    exampleApp.setVersion("v1");
+    exampleApp.addComponent(createComponent("compa", 2, "sleep 1000"));
+    Configuration serviceConfig = new Configuration();
+    serviceConfig.setProperty(AM_RESTART_MAX, "2");
+    serviceConfig.setProperty(AM_FAILURES_VALIDITY_INTERVAL, "1000");
+    exampleApp.setConfiguration(serviceConfig);
+    client.actionCreate(exampleApp);
+    waitForServiceToBeStable(client, exampleApp);
+
+    Service appStatus1 = client.getStatus(exampleApp.getName());
+    ApplicationId exampleAppId = ApplicationId.fromString(appStatus1.getId());
+    YarnClient yarnClient = createYarnClient(getConf());
+
+    // Kill AM1: uses up the first of the AM_RESTART_MAX (2) attempts set above.
+    ApplicationReport applicationReport = yarnClient.getApplicationReport(
+        exampleAppId);
+    ApplicationAttemptReport attemptReport = yarnClient
+        .getApplicationAttemptReport(applicationReport
+            .getCurrentApplicationAttemptId());
+    yarnClient.signalToContainer(attemptReport.getAMContainerId(),
+        SignalContainerCommand.GRACEFUL_SHUTDOWN);
+    waitForServiceToBeStable(client, exampleApp);
+    Assert.assertEquals(ServiceState.STABLE, client.getStatus(
+        exampleApp.getName()).getState());
+
+    // Kill AM2 after the 1000 ms validity interval (AM1 presumably not counted).
+    Thread.sleep(2000);
+    applicationReport = yarnClient.getApplicationReport(exampleAppId);
+    attemptReport = yarnClient.getApplicationAttemptReport(applicationReport
+        .getCurrentApplicationAttemptId());
+    yarnClient.signalToContainer(attemptReport.getAMContainerId(),
+        SignalContainerCommand.GRACEFUL_SHUTDOWN);
+    waitForServiceToBeStable(client, exampleApp);
+    Assert.assertEquals(ServiceState.STABLE, client.getStatus(
+        exampleApp.getName()).getState());
+  }
 }