瀏覽代碼

YARN-11709. NodeManager should be shut down or blacklisted when it cannot run program /var/lib/yarn-ce/bin/container-executor (#6960)

Ferenc Erdelyi 11 月之前
父節點
當前提交
f00094203b

+ 4 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java

@@ -451,8 +451,10 @@ public class LinuxContainerExecutor extends ContainerExecutor {
 
     } catch (PrivilegedOperationException e) {
       int exitCode = e.getExitCode();
-      LOG.warn("Exit code from container {} startLocalizer is : {}",
-          locId, exitCode, e);
+      LOG.error("Unrecoverable issue occurred. Marking the node as unhealthy to prevent "
+          + "further containers to get scheduled on the node and cause application failures. " +
+          "Exit code from the container " + locId + "startLocalizer is : " + exitCode, e);
+      nmContext.getNodeStatusUpdater().reportException(e);
 
       throw new IOException("Application " + appId + " initialization failed" +
           " (exitCode=" + exitCode + ") with output: " + e.getOutput(), e);

+ 32 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java

@@ -26,6 +26,7 @@ import static org.junit.Assert.assertTrue;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.anyBoolean;
 import static org.mockito.Mockito.doAnswer;
+import static org.mockito.Mockito.doNothing;
 import static org.mockito.Mockito.doThrow;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.spy;
@@ -37,6 +38,7 @@ import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.LineNumberReader;
+import java.lang.reflect.Field;
 import java.net.InetSocketAddress;
 import java.net.URI;
 import java.net.URISyntaxException;
@@ -345,7 +347,8 @@ public class TestLinuxContainerExecutorWithMocks {
   
   @Test
   public void testContainerLaunchError()
-      throws IOException, ContainerExecutionException, URISyntaxException {
+      throws IOException, ContainerExecutionException, URISyntaxException, IllegalAccessException,
+      NoSuchFieldException {
 
     final String[] expecetedMessage = {"badcommand", "Exit code: 24"};
     final String[] executor = {
@@ -387,6 +390,14 @@ public class TestLinuxContainerExecutorWithMocks {
       dirsHandler.init(conf);
       mockExec.setConf(conf);
 
+      //set the private nmContext field without initing the LinuxContainerExecutor
+      NodeManager nodeManager = new NodeManager();
+      NodeManager.NMContext nmContext =
+          nodeManager.createNMContext(null, null, null, false, conf);
+      Field lceNmContext = LinuxContainerExecutor.class.getDeclaredField("nmContext");
+      lceNmContext.setAccessible(true);
+      lceNmContext.set(mockExec, nmContext);
+
       String appSubmitter = "nobody";
       String cmd = String
           .valueOf(PrivilegedOperation.RunAsUserCommand.LAUNCH_CONTAINER.
@@ -601,8 +612,6 @@ public class TestLinuxContainerExecutorWithMocks {
     LinuxContainerRuntime runtime = new DefaultLinuxContainerRuntime(
         spyPrivilegedExecutor);
     runtime.initialize(conf, null);
-    mockExec = new LinuxContainerExecutor(runtime);
-    mockExec.setConf(conf);
     LinuxContainerExecutor lce = new LinuxContainerExecutor(runtime) {
       @Override
       protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
@@ -610,6 +619,23 @@ public class TestLinuxContainerExecutorWithMocks {
       }
     };
     lce.setConf(conf);
+
+    //set the private nmContext field without initing the LinuxContainerExecutor
+    NodeManager nodeManager = new NodeManager();
+    NodeManager.NMContext nmContext =
+        nodeManager.createNMContext(null, null, null, false, conf);
+    NodeManager.NMContext spyNmContext = spy(nmContext);
+
+    //initialize a mock NodeStatusUpdater
+    NodeStatusUpdaterImpl nodeStatusUpdater = mock(NodeStatusUpdaterImpl.class);
+    nmContext.setNodeStatusUpdater(nodeStatusUpdater);
+    //imitate a void method call on the NodeStatusUpdater when setting NM unhealthy.
+    doNothing().when(nodeStatusUpdater).reportException(any());
+
+    Field lceNmContext = LinuxContainerExecutor.class.getDeclaredField("nmContext");
+    lceNmContext.setAccessible(true);
+    lceNmContext.set(lce, nmContext);
+
     InetSocketAddress address = InetSocketAddress.createUnresolved(
         "localhost", 8040);
     Path nmPrivateCTokensPath= new Path("file:///bin/nmPrivateCTokensPath");
@@ -672,6 +698,9 @@ public class TestLinuxContainerExecutorWithMocks {
       assertTrue("Unexpected exception " + e,
           e.getMessage().contains("exit code"));
     }
+
+    //verify that the NM was set unhealthy on PrivilegedOperationException
+    verify(nodeStatusUpdater, times(1)).reportException(any());
   }
 
   @Test