Ver código fonte

YARN-2846. Incorrect exit code persisted for running containers when reacquireContainer() is interrupted by NodeManager restart. Contributed by Junping Du
(cherry picked from commit 33ea5ae92b9dd3abace104903d9a94d17dd75af5)

Jason Lowe 10 anos atrás
pai
commit
6fd547da8f

+ 4 - 0
hadoop-yarn-project/CHANGES.txt

@@ -931,6 +931,10 @@ Release 2.6.0 - 2014-11-15
     YARN-2794. Fixed log messages about distributing system-credentials. (Jian He via
     YARN-2794. Fixed log messages about distributing system-credentials. (Jian He via
     zjshen)
     zjshen)
 
 
+    YARN-2846. Incorrect exit code persisted for running containers when
+    reacquireContainer() is interrupted by NodeManager restart. (Junping Du
+    via jlowe)
+
 Release 2.5.2 - 2014-11-10
 Release 2.5.2 - 2014-11-10
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 7 - 14
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java

@@ -159,9 +159,10 @@ public abstract class ContainerExecutor implements Configurable {
    * @param containerId The ID of the container to reacquire
    * @param containerId The ID of the container to reacquire
    * @return The exit code of the pre-existing container
    * @return The exit code of the pre-existing container
    * @throws IOException
    * @throws IOException
+   * @throws InterruptedException if the wait for the container is interrupted
    */
    */
   public int reacquireContainer(String user, ContainerId containerId)
   public int reacquireContainer(String user, ContainerId containerId)
-      throws IOException {
+      throws IOException, InterruptedException {
     Path pidPath = getPidFilePath(containerId);
     Path pidPath = getPidFilePath(containerId);
     if (pidPath == null) {
     if (pidPath == null) {
       LOG.warn(containerId + " is not active, returning terminated error");
       LOG.warn(containerId + " is not active, returning terminated error");
@@ -175,13 +176,8 @@ public abstract class ContainerExecutor implements Configurable {
     }
     }
 
 
     LOG.info("Reacquiring " + containerId + " with pid " + pid);
     LOG.info("Reacquiring " + containerId + " with pid " + pid);
-    try {
-      while(isContainerProcessAlive(user, pid)) {
-        Thread.sleep(1000);
-      }
-    } catch (InterruptedException e) {
-      throw new IOException("Interrupted while waiting for process " + pid
-          + " to exit", e);
+    while(isContainerProcessAlive(user, pid)) {
+      Thread.sleep(1000);
     }
     }
 
 
     // wait for exit code file to appear
     // wait for exit code file to appear
@@ -194,12 +190,9 @@ public abstract class ContainerExecutor implements Configurable {
         LOG.info(containerId + " was deactivated");
         LOG.info(containerId + " was deactivated");
         return ExitCode.TERMINATED.getExitCode();
         return ExitCode.TERMINATED.getExitCode();
       }
       }
-      try {
-        Thread.sleep(sleepMsec);
-      } catch (InterruptedException e) {
-        throw new IOException(
-            "Interrupted while waiting for exit code from " + containerId, e);
-      }
+      
+      Thread.sleep(sleepMsec);
+      
       msecLeft -= sleepMsec;
       msecLeft -= sleepMsec;
     }
     }
     if (msecLeft < 0) {
     if (msecLeft < 0) {

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java

@@ -345,7 +345,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
 
 
   @Override
   @Override
   public int reacquireContainer(String user, ContainerId containerId)
   public int reacquireContainer(String user, ContainerId containerId)
-      throws IOException {
+      throws IOException, InterruptedException {
     try {
     try {
       return super.reacquireContainer(user, containerId);
       return super.reacquireContainer(user, containerId);
     } finally {
     } finally {

+ 13 - 7
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java

@@ -73,6 +73,7 @@ public class RecoveredContainerLaunch extends ContainerLaunch {
     dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
     dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
         ContainerEventType.CONTAINER_LAUNCHED));
         ContainerEventType.CONTAINER_LAUNCHED));
 
 
+    boolean notInterrupted = true;
     try {
     try {
       File pidFile = locatePidFile(appIdStr, containerIdStr);
       File pidFile = locatePidFile(appIdStr, containerIdStr);
       if (pidFile != null) {
       if (pidFile != null) {
@@ -85,14 +86,19 @@ public class RecoveredContainerLaunch extends ContainerLaunch {
       }
       }
     } catch (IOException e) {
     } catch (IOException e) {
         LOG.error("Unable to recover container " + containerIdStr, e);
         LOG.error("Unable to recover container " + containerIdStr, e);
+    } catch (InterruptedException e) {
+      LOG.warn("Interrupted while waiting for exit code from " + containerId);
+      notInterrupted = false;
     } finally {
     } finally {
-      this.completed.set(true);
-      exec.deactivateContainer(containerId);
-      try {
-        getContext().getNMStateStore().storeContainerCompleted(containerId,
-            retCode);
-      } catch (IOException e) {
-        LOG.error("Unable to set exit code for container " + containerId);
+      if (notInterrupted) {
+        this.completed.set(true);
+        exec.deactivateContainer(containerId);
+        try {
+          getContext().getNMStateStore().storeContainerCompleted(containerId,
+              retCode);
+        } catch (IOException e) {
+          LOG.error("Unable to set exit code for container " + containerId);
+        }
       }
       }
     }
     }