Browse Source

YARN-11709. NodeManager should be marked unhealthy on localizer config issues (#7043)

Benjamin Teke 7 months ago
parent
commit
d1311e52f7

+ 2 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java

@@ -171,9 +171,10 @@ public abstract class ContainerExecutor implements Configurable {
    *            for starting a localizer.
    * @throws IOException for most application init failures
    * @throws InterruptedException if application init thread is halted by NM
+   * @throws ConfigurationException if a configuration error was found
    */
   public abstract void startLocalizer(LocalizerStartContext ctx)
-    throws IOException, InterruptedException;
+      throws IOException, InterruptedException, ConfigurationException;
 
   /**
    * Prepare the container prior to the launch environment being written.

+ 22 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java

@@ -389,7 +389,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
 
   @Override
   public void startLocalizer(LocalizerStartContext ctx)
-      throws IOException, InterruptedException {
+      throws IOException, InterruptedException, ConfigurationException {
     Path nmPrivateContainerTokensPath = ctx.getNmPrivateContainerTokens();
     InetSocketAddress nmAddr = ctx.getNmAddr();
     String user = ctx.getUser();
@@ -440,9 +440,9 @@ public class LinuxContainerExecutor extends ContainerExecutor {
     localizerArgs = replaceWithContainerLogDir(localizerArgs, containerLogDir);
 
     initializeContainerOp.appendArgs(localizerArgs);
+    Configuration conf = super.getConf();
 
     try {
-      Configuration conf = super.getConf();
       PrivilegedOperationExecutor privilegedOperationExecutor =
           getPrivilegedOperationExecutor();
 
@@ -452,7 +452,26 @@ public class LinuxContainerExecutor extends ContainerExecutor {
     } catch (PrivilegedOperationException e) {
       int exitCode = e.getExitCode();
       LOG.warn("Exit code from container {} startLocalizer is : {}",
-          locId, exitCode, e);
+            locId, exitCode, e);
+
+      if (exitCode ==
+          ExitCode.INVALID_CONTAINER_EXEC_PERMISSIONS.getExitCode() ||
+          exitCode == ExitCode.INVALID_CONFIG_FILE.getExitCode()) {
+        throw new ConfigurationException("Application " + appId + " initialization failed" +
+            " (exitCode=" + exitCode + ") with an unrecoverable config error. " +
+            "Output: " + e.getOutput(), e);
+      }
+
+      // Check if the failure was due to a missing container-executor binary.
+      // Throwable.getMessage() may return null, so guard it before matching;
+      // otherwise an NPE would mask the original failure.
+      Throwable cause = e.getCause() != null ? e.getCause() : e;
+      String causeMessage = cause instanceof IOException ? cause.getMessage() : null;
+      if (causeMessage != null && causeMessage.contains("No such file or directory")) {
+        throw new ConfigurationException("Application " + appId + " initialization failed" +
+            " (exitCode=" + exitCode + "). Container executor not found at "
+            + getContainerExecutorExecutablePath(conf), e);
+      }
 
       throw new IOException("Application " + appId + " initialization failed" +
           " (exitCode=" + exitCode + ") with output: " + e.getOutput(), e);

+ 9 - 5
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java

@@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer;
 import static org.apache.hadoop.fs.CreateFlag.CREATE;
 import static org.apache.hadoop.fs.CreateFlag.OVERWRITE;
 
+import org.apache.hadoop.yarn.exceptions.ConfigurationException;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.RecoveryIterator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -1255,7 +1256,7 @@ public class ResourceLocalizationService extends CompositeService
       try {
         // Get nmPrivateDir
         nmPrivateCTokensPath = dirsHandler.getLocalPathForWrite(
-                NM_PRIVATE_DIR + Path.SEPARATOR + tokenFileName);
+            NM_PRIVATE_DIR + Path.SEPARATOR + tokenFileName);
 
         // 0) init queue, etc.
         // 1) write credentials to private dir
@@ -1275,10 +1276,13 @@ public class ResourceLocalizationService extends CompositeService
           throw new IOException("All disks failed. "
               + dirsHandler.getDisksHealthReport(false));
         }
-      // TODO handle ExitCodeException separately?
-      } catch (FSError fe) {
-        exception = fe;
-      } catch (Exception e) {
+        // TODO handle ExitCodeException separately?
+      } catch (ConfigurationException e) {
+        exception = e;
+        LOG.error("Failed to launch localizer for {}, due to configuration error. " +
+            "Marking the node unhealthy.", localizerId, e);
+        nmContext.getNodeStatusUpdater().reportException(e);
+      } catch (Exception | FSError e) {
         exception = e;
       } finally {
         if (exception != null) {

+ 56 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java

@@ -336,7 +336,7 @@ public class TestLinuxContainerExecutorWithMocks {
       assertThat(result.get(23)).isEqualTo("8040");
       assertThat(result.get(24)).isEqualTo("nmPrivateCTokensPath");
 
-    } catch (InterruptedException e) {
+    } catch (ConfigurationException | InterruptedException e) {
       LOG.error("Error:"+e.getMessage(),e);
       Assert.fail();
     }
@@ -643,6 +643,61 @@ public class TestLinuxContainerExecutorWithMocks {
           e.getMessage().contains("exitCode"));
     }
 
+    final int[] exitCodesToThrow = {
+        LinuxContainerExecutor.ExitCode.INVALID_CONTAINER_EXEC_PERMISSIONS.getExitCode(),
+        LinuxContainerExecutor.ExitCode.INVALID_CONFIG_FILE.getExitCode(),
+    };
+
+    for (int i = 0; i < exitCodesToThrow.length; i++) {
+      int exitCode = exitCodesToThrow[i];
+      doThrow(new PrivilegedOperationException("invalid config", exitCode, null, null))
+          .when(spyPrivilegedExecutor).executePrivilegedOperation(
+              any(), any(PrivilegedOperation.class),
+              any(), any(), anyBoolean(), anyBoolean());
+
+      try {
+        lce.startLocalizer(new LocalizerStartContext.Builder()
+            .setNmPrivateContainerTokens(nmPrivateCTokensPath)
+            .setNmAddr(address)
+            .setUser(appSubmitter)
+            .setAppId(appId.toString())
+            .setLocId("12345")
+            .setDirsHandler(dirService)
+            .build());
+        Assert.fail("startLocalizer should have thrown a ConfigurationException");
+      } catch (ConfigurationException e) {
+        assertTrue("Unexpected exception " + e,
+            e.getMessage().contains("exitCode=" + exitCode));
+      }
+    }
+
+    doThrow(new PrivilegedOperationException("IO error",
+        new IOException("No such file or directory")))
+        .when(spyPrivilegedExecutor).executePrivilegedOperation(
+            any(), any(PrivilegedOperation.class),
+            any(), any(), anyBoolean(), anyBoolean());
+
+    try {
+      lce.startLocalizer(new LocalizerStartContext.Builder()
+          .setNmPrivateContainerTokens(nmPrivateCTokensPath)
+          .setNmAddr(address)
+          .setUser(appSubmitter)
+          .setAppId(appId.toString())
+          .setLocId("12345")
+          .setDirsHandler(dirService)
+          .build());
+      Assert.fail("startLocalizer should have thrown a ConfigurationException");
+    } catch (ConfigurationException e) {
+      assertTrue("Unexpected exception " + e,
+          e.getMessage().contains("Container executor not found"));
+    }
+
+
+    doThrow(new PrivilegedOperationException("interrupted"))
+        .when(spyPrivilegedExecutor).executePrivilegedOperation(
+            any(), any(PrivilegedOperation.class),
+            any(), any(), anyBoolean(), anyBoolean());
+
     lce.activateContainer(cid, new Path(workDir, "pid.txt"));
     lce.launchContainer(new ContainerStartContext.Builder()
         .setContainer(container)