浏览代码

YARN-2846. Incorrect persisted exit code for running containers in reacquireContainer() that is interrupted by NodeManager restart. Contributed by Junping Du

(cherry picked from commit 33ea5ae92b9dd3abace104903d9a94d17dd75af5)
Jason Lowe 10 年之前
父节点
当前提交
9820c47574

+ 4 - 0
hadoop-yarn-project/CHANGES.txt

@@ -869,6 +869,10 @@ Release 2.6.0 - 2014-11-15
     YARN-2794. Fixed log messages about distributing system-credentials. (Jian He via
     YARN-2794. Fixed log messages about distributing system-credentials. (Jian He via
     zjshen)
     zjshen)
 
 
+    YARN-2846. Incorrect persisted exit code for running containers in
+    reacquireContainer() that is interrupted by NodeManager restart. (Junping
+    Du via jlowe)
+
 Release 2.5.2 - 2014-11-10
 Release 2.5.2 - 2014-11-10
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 7 - 14
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java

@@ -159,9 +159,10 @@ public abstract class ContainerExecutor implements Configurable {
    * @param containerId The ID of the container to reacquire
    * @param containerId The ID of the container to reacquire
    * @return The exit code of the pre-existing container
    * @return The exit code of the pre-existing container
    * @throws IOException
    * @throws IOException
+   * @throws InterruptedException if interrupted while waiting for the exit code
    */
    */
   public int reacquireContainer(String user, ContainerId containerId)
   public int reacquireContainer(String user, ContainerId containerId)
-      throws IOException {
+      throws IOException, InterruptedException {
     Path pidPath = getPidFilePath(containerId);
     Path pidPath = getPidFilePath(containerId);
     if (pidPath == null) {
     if (pidPath == null) {
       LOG.warn(containerId + " is not active, returning terminated error");
       LOG.warn(containerId + " is not active, returning terminated error");
@@ -175,13 +176,8 @@ public abstract class ContainerExecutor implements Configurable {
     }
     }
 
 
     LOG.info("Reacquiring " + containerId + " with pid " + pid);
     LOG.info("Reacquiring " + containerId + " with pid " + pid);
-    try {
-      while(isContainerProcessAlive(user, pid)) {
-        Thread.sleep(1000);
-      }
-    } catch (InterruptedException e) {
-      throw new IOException("Interrupted while waiting for process " + pid
-          + " to exit", e);
+    while(isContainerProcessAlive(user, pid)) {
+      Thread.sleep(1000);
     }
     }
 
 
     // wait for exit code file to appear
     // wait for exit code file to appear
@@ -194,12 +190,9 @@ public abstract class ContainerExecutor implements Configurable {
         LOG.info(containerId + " was deactivated");
         LOG.info(containerId + " was deactivated");
         return ExitCode.TERMINATED.getExitCode();
         return ExitCode.TERMINATED.getExitCode();
       }
       }
-      try {
-        Thread.sleep(sleepMsec);
-      } catch (InterruptedException e) {
-        throw new IOException(
-            "Interrupted while waiting for exit code from " + containerId, e);
-      }
+      
+      Thread.sleep(sleepMsec);
+      
       msecLeft -= sleepMsec;
       msecLeft -= sleepMsec;
     }
     }
     if (msecLeft < 0) {
     if (msecLeft < 0) {

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java

@@ -345,7 +345,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
 
 
   @Override
   @Override
   public int reacquireContainer(String user, ContainerId containerId)
   public int reacquireContainer(String user, ContainerId containerId)
-      throws IOException {
+      throws IOException, InterruptedException {
     try {
     try {
       return super.reacquireContainer(user, containerId);
       return super.reacquireContainer(user, containerId);
     } finally {
     } finally {

+ 13 - 7
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java

@@ -73,6 +73,7 @@ public class RecoveredContainerLaunch extends ContainerLaunch {
     dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
     dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
         ContainerEventType.CONTAINER_LAUNCHED));
         ContainerEventType.CONTAINER_LAUNCHED));
 
 
+    boolean notInterrupted = true;
     try {
     try {
       File pidFile = locatePidFile(appIdStr, containerIdStr);
       File pidFile = locatePidFile(appIdStr, containerIdStr);
       if (pidFile != null) {
       if (pidFile != null) {
@@ -85,14 +86,19 @@ public class RecoveredContainerLaunch extends ContainerLaunch {
       }
       }
     } catch (IOException e) {
     } catch (IOException e) {
         LOG.error("Unable to recover container " + containerIdStr, e);
         LOG.error("Unable to recover container " + containerIdStr, e);
+    } catch (InterruptedException e) {
+      LOG.warn("Interrupted while waiting for exit code from " + containerId);
+      notInterrupted = false;
     } finally {
     } finally {
-      this.completed.set(true);
-      exec.deactivateContainer(containerId);
-      try {
-        getContext().getNMStateStore().storeContainerCompleted(containerId,
-            retCode);
-      } catch (IOException e) {
-        LOG.error("Unable to set exit code for container " + containerId);
+      if (notInterrupted) {
+        this.completed.set(true);
+        exec.deactivateContainer(containerId);
+        try {
+          getContext().getNMStateStore().storeContainerCompleted(containerId,
+              retCode);
+        } catch (IOException e) {
+          LOG.error("Unable to set exit code for container " + containerId);
+        }
       }
       }
     }
     }