Browse Source

YARN-10822. Containers going from New to Scheduled transition for kil… (#3632)

minni31 3 years ago
parent
commit
87abc437c7

+ 13 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java

@@ -1187,10 +1187,7 @@ public class ContainerImpl implements Container {
       if (container.recoveredStatus == RecoveredContainerStatus.COMPLETED) {
       if (container.recoveredStatus == RecoveredContainerStatus.COMPLETED) {
         container.sendFinishedEvents();
         container.sendFinishedEvents();
         return ContainerState.DONE;
         return ContainerState.DONE;
-      } else if (container.recoveredStatus == RecoveredContainerStatus.QUEUED) {
-        return ContainerState.SCHEDULED;
-      } else if (container.recoveredAsKilled &&
-          container.recoveredStatus == RecoveredContainerStatus.REQUESTED) {
+      } else if (isContainerRecoveredAsKilled(container)) {
         // container was killed but never launched
         // container was killed but never launched
         container.metrics.killedContainer();
         container.metrics.killedContainer();
         NMAuditLogger.logSuccess(container.user,
         NMAuditLogger.logSuccess(container.user,
@@ -1201,6 +1198,8 @@ public class ContainerImpl implements Container {
             container.containerTokenIdentifier.getResource());
             container.containerTokenIdentifier.getResource());
         container.sendFinishedEvents();
         container.sendFinishedEvents();
         return ContainerState.DONE;
         return ContainerState.DONE;
+      } else if (container.recoveredStatus == RecoveredContainerStatus.QUEUED) {
+        return ContainerState.SCHEDULED;
       }
       }
 
 
       final ContainerLaunchContext ctxt = container.launchContext;
       final ContainerLaunchContext ctxt = container.launchContext;
@@ -1264,6 +1263,16 @@ public class ContainerImpl implements Container {
         return ContainerState.LOCALIZATION_FAILED;
         return ContainerState.LOCALIZATION_FAILED;
       }
       }
     }
     }
+
+    static boolean isContainerRecoveredAsKilled(ContainerImpl container) {
+      if (!container.recoveredAsKilled) {
+        return false;
+      }
+      // container was killed but never launched
+      RecoveredContainerStatus containerStatus = container.recoveredStatus;
+      return containerStatus == RecoveredContainerStatus.REQUESTED
+          || containerStatus == RecoveredContainerStatus.QUEUED;
+    }
   }
   }
 
 
   /**
   /**

+ 49 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java

@@ -42,6 +42,7 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashMap;
 import java.util.List;
 import java.util.List;
 import java.util.Map;
 import java.util.Map;
+import java.util.concurrent.ConcurrentMap;
 
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileContext;
 import org.apache.hadoop.fs.FileContext;
@@ -682,7 +683,49 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
     verify(cm, never()).handle(isA(CMgrCompletedAppsEvent.class));
     verify(cm, never()).handle(isA(CMgrCompletedAppsEvent.class));
   }
   }
 
 
-  private void commonLaunchContainer(ApplicationId appId, ContainerId cid,
+  @Test
+  public void testKilledContainerInQueuedStateRecovery() throws Exception {
+    conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
+    conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
+    NMStateStoreService stateStore = new NMMemoryStateStoreService();
+    stateStore.init(conf);
+    stateStore.start();
+    context = createContext(conf, stateStore);
+    ContainerManagerImpl cm = createContainerManager(context, delSrvc);
+    ((NMContext) context).setContainerManager(cm);
+    cm.init(conf);
+    cm.start();
+
+    // add an application by starting a container
+    ApplicationId appId = ApplicationId.newInstance(0, 0);
+    ApplicationAttemptId attemptId =
+        ApplicationAttemptId.newInstance(appId, 1);
+    ContainerId cid = ContainerId.newContainerId(attemptId, 1);
+    createStartContainerRequest(appId, cid, cm);
+
+    Application app = context.getApplications().get(appId);
+    assertEquals(1, context.getApplications().size());
+    assertNotNull(app);
+
+    stateStore.storeContainerKilled(cid);
+    // restart and verify container scheduler has recovered correctly
+    cm.stop();
+    context = createContext(conf, stateStore);
+    cm = createContainerManager(context, delSrvc);
+    ((NMContext) context).setContainerManager(cm);
+    cm.init(conf);
+    cm.start();
+    assertEquals(1, context.getApplications().size());
+
+    ConcurrentMap<ContainerId, Container> containers = context.getContainers();
+    Container c = containers.get(cid);
+    assertEquals(ContainerState.DONE, c.getContainerState());
+    app = context.getApplications().get(appId);
+    assertNotNull(app);
+    cm.stop();
+  }
+
+  private void createStartContainerRequest(ApplicationId appId, ContainerId cid,
       ContainerManagerImpl cm) throws Exception {
       ContainerManagerImpl cm) throws Exception {
     Map<String, String> containerEnv = new HashMap<>();
     Map<String, String> containerEnv = new HashMap<>();
     setFlowContext(containerEnv, "app_name1", appId);
     setFlowContext(containerEnv, "app_name1", appId);
@@ -727,6 +770,11 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
         context, cm, cid, clc, null, ContainerType.TASK);
         context, cm, cid, clc, null, ContainerType.TASK);
     assertTrue(startResponse.getFailedRequests().isEmpty());
     assertTrue(startResponse.getFailedRequests().isEmpty());
     assertEquals(1, context.getApplications().size());
     assertEquals(1, context.getApplications().size());
+  }
+
+  private void commonLaunchContainer(ApplicationId appId, ContainerId cid,
+      ContainerManagerImpl cm) throws Exception {
+    createStartContainerRequest(appId, cid, cm);
     // make sure the container reaches RUNNING state
     // make sure the container reaches RUNNING state
     waitForNMContainerState(cm, cid,
     waitForNMContainerState(cm, cid,
         org.apache.hadoop.yarn.server.nodemanager
         org.apache.hadoop.yarn.server.nodemanager

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java

@@ -168,6 +168,7 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
       int version, long startTime, StartContainerRequest startRequest) {
       int version, long startTime, StartContainerRequest startRequest) {
     RecoveredContainerState rcs = new RecoveredContainerState(containerId);
     RecoveredContainerState rcs = new RecoveredContainerState(containerId);
     rcs.startRequest = startRequest;
     rcs.startRequest = startRequest;
+    rcs.status = RecoveredContainerStatus.REQUESTED;
     rcs.version = version;
     rcs.version = version;
     try {
     try {
       ContainerTokenIdentifier containerTokenIdentifier = BuilderUtils
       ContainerTokenIdentifier containerTokenIdentifier = BuilderUtils