瀏覽代碼

YARN-9486. Docker container exited with failure does not get cleaned up correctly. Contributed by Eric Yang

Eric Badger 6 年之前
父節點
當前提交
79d3d35398

+ 4 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerCleanup.java

@@ -95,8 +95,10 @@ public class ContainerCleanup implements Runnable {
           + " killed in store", e);
     }
 
-    // launch flag will be set to true if process already launched
-    boolean alreadyLaunched = !launch.markLaunched();
+    // launch flag will be set to true if process already launched,
+    // in process of launching, or failed to launch.
+    boolean alreadyLaunched = !launch.markLaunched() ||
+        launch.isLaunchCompleted();
     if (!alreadyLaunched) {
       LOG.info("Container " + containerIdStr + " not launched."
           + " No cleanup needed to be done");

+ 8 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerRelaunch.java

@@ -87,7 +87,14 @@ public class ContainerRelaunch extends ContainerLaunch {
       Path nmPrivateTruststorePath = (container.getCredentials().getSecretKey(
           AMSecretKeys.YARN_APPLICATION_AM_TRUSTSTORE) == null) ? null :
           getNmPrivateTruststorePath(appIdStr, containerIdStr);
-      pidFilePath = getPidFilePath(appIdStr, containerIdStr);
+      try {
+        // try to locate existing pid file.
+        pidFilePath = getPidFilePath(appIdStr, containerIdStr);
+      } catch (IOException e) {
+        // reset pid file path if it did not exist.
+        String pidFileSubpath = getPidFileSubpath(appIdStr, containerIdStr);
+        pidFilePath = dirsHandler.getLocalPathForWrite(pidFileSubpath);
+      }
 
       LOG.info("Relaunch container with "
           + "workDir = " + containerWorkDir.toString()

+ 13 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerCleanup.java

@@ -79,6 +79,7 @@ public class TestContainerCleanup {
 
     launch = mock(ContainerLaunch.class);
     launch.containerAlreadyLaunched = new AtomicBoolean(false);
+    launch.completed = new AtomicBoolean(false);
 
     launch.pidFilePath = new Path("target/" + containerId.toString() + ".pid");
     when(launch.getContainerPid()).thenReturn(containerId.toString());
@@ -105,4 +106,16 @@ public class TestContainerCleanup {
     Assert.assertEquals("signal", ContainerExecutor.Signal.TERM,
         captor.getValue().getSignal());
   }
+
+  @Test
+  public void testFailedExitCleanup() throws Exception {
+    launch.completed.set(true);
+    cleanup.run();
+    ArgumentCaptor<ContainerSignalContext> captor =
+        ArgumentCaptor.forClass(ContainerSignalContext.class);
+
+    verify(executor, Mockito.times(1)).signalContainer(captor.capture());
+    Assert.assertEquals("signal", ContainerExecutor.Signal.TERM,
+        captor.getValue().getSignal());
+  }
 }