Explorar o código

YARN-2359. Application hangs when it fails to launch AM container. (Zhihai Xu via kasha)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1616375 13f79535-47bb-0310-9956-ffa450edef68
Karthik Kambatla %!s(int64=10) %!d(string=hai) anos
pai
achega
8feddc4c84

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -137,6 +137,9 @@ Release 2.6.0 - UNRELEASED
     YARN-2374. Fixed TestDistributedShell#testDSShell failure due to hostname
     dismatch. (Varun Vasudev via jianhe)
 
+    YARN-2359. Application hangs when it fails to launch AM container. 
+    (Zhihai Xu via kasha)
+
 Release 2.5.0 - UNRELEASED
 
   INCOMPATIBLE CHANGES

+ 7 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java

@@ -217,7 +217,13 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
           RMAppAttemptEventType.KILL,
           new FinalSavingTransition(new BaseFinalTransition(
             RMAppAttemptState.KILLED), RMAppAttemptState.KILLED))
-          
+      .addTransition(RMAppAttemptState.SCHEDULED,
+          RMAppAttemptState.FINAL_SAVING,
+          RMAppAttemptEventType.CONTAINER_FINISHED,
+          new FinalSavingTransition(
+            new AMContainerCrashedBeforeRunningTransition(),
+            RMAppAttemptState.FAILED))
+
        // Transitions from ALLOCATED_SAVING State
       .addTransition(RMAppAttemptState.ALLOCATED_SAVING, 
           RMAppAttemptState.ALLOCATED,

+ 27 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java

@@ -89,6 +89,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAlloca
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
@@ -782,6 +783,32 @@ public class TestRMAppAttemptTransitions {
     testAppAttemptKilledState(null, EMPTY_DIAGNOSTICS);
   }
 
+  @Test
+  public void testAMCrashAtScheduled() {
+    // This is to test sending CONTAINER_FINISHED event at SCHEDULED state.
+    // Verify the state transition is correct.
+    scheduleApplicationAttempt();
+    ContainerStatus cs =
+        SchedulerUtils.createAbnormalContainerStatus(
+            BuilderUtils.newContainerId(
+                applicationAttempt.getAppAttemptId(), 1),
+            SchedulerUtils.LOST_CONTAINER);
+    // send CONTAINER_FINISHED event at SCHEDULED state,
+    // The state should be FINAL_SAVING with previous state SCHEDULED
+    applicationAttempt.handle(new RMAppAttemptContainerFinishedEvent(
+        applicationAttempt.getAppAttemptId(), cs));
+    // createApplicationAttemptState will return previous state (SCHEDULED),
+    // if the current state is FINAL_SAVING.
+    assertEquals(YarnApplicationAttemptState.SCHEDULED,
+        applicationAttempt.createApplicationAttemptState());
+    // send ATTEMPT_UPDATE_SAVED event,
+    // verify the state is changed to state FAILED.
+    sendAttemptUpdateSavedEvent(applicationAttempt);
+    assertEquals(RMAppAttemptState.FAILED,
+        applicationAttempt.getAppAttemptState());
+    verifyApplicationAttemptFinished(RMAppAttemptState.FAILED);
+  }
+
   @Test
   public void testAllocatedToKilled() {
     Container amContainer = allocateApplicationAttempt();