瀏覽代碼

Merge 1529044 from branch-2 to branch-2.1-beta for YARN-1149. NM throws InvalidStateTransitonException: Invalid event: APPLICATION_LOG_HANDLING_FINISHED at RUNNING. Contributed by Xuan Gong.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2.1-beta@1529070 13f79535-47bb-0310-9956-ffa450edef68
Hitesh Shah 11 年之前
父節點
當前提交
3d567534d8
共有 16 個文件被更改,包括 230 次插入155 次删除
  1. 3 0
      hadoop-yarn-project/CHANGES.txt
  2. 1 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestAMRMClient.java
  3. 19 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedAppsEvent.java
  4. 2 12
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java
  5. 1 77
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
  6. 5 5
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
  7. 148 46
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
  8. 22 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java
  9. 1 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java
  10. 1 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java
  11. 1 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java
  12. 21 8
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
  13. 1 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java
  14. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java
  15. 2 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/TestApplication.java
  16. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -92,6 +92,9 @@ Release 2.1.2 - UNRELEASED
     YARN-1271. "Text file busy" errors launching containers again
     (Sandy Ryza)
 
+    YARN-1149. NM throws InvalidStateTransitonException: Invalid event:
+    APPLICATION_LOG_HANDLING_FINISHED at RUNNING (Xuan Gong via hitesh)
+
 Release 2.1.1-beta - 2013-09-23
 
   INCOMPATIBLE CHANGES

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestAMRMClient.java

@@ -107,6 +107,7 @@ public class TestAMRMClient {
     // start minicluster
     conf = new YarnConfiguration();
     conf.setInt(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS, 100);
+    conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
     yarnCluster = new MiniYARNCluster(TestAMRMClient.class.getName(), nodeCount, 1, 1);
     yarnCluster.init(conf);
     yarnCluster.start();

+ 19 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedAppsEvent.java

@@ -27,13 +27,31 @@ import org.apache.hadoop.yarn.server.nodemanager.ContainerManagerEventType;
 public class CMgrCompletedAppsEvent extends ContainerManagerEvent {
 
   private final List<ApplicationId> appsToCleanup;
+  private final Reason reason;
 
-  public CMgrCompletedAppsEvent(List<ApplicationId> appsToCleanup) {
+  public CMgrCompletedAppsEvent(List<ApplicationId> appsToCleanup, Reason reason) {
     super(ContainerManagerEventType.FINISH_APPS);
     this.appsToCleanup = appsToCleanup;
+    this.reason = reason;
   }
 
   public List<ApplicationId> getAppsToCleanup() {
     return this.appsToCleanup;
   }
+
+  public Reason getReason() {
+    return reason;
+  }
+
+  public static enum Reason {
+    /**
+     * Application is killed as NodeManager is shut down
+     */
+    ON_SHUTDOWN, 
+
+    /**
+     * Application is killed by ResourceManager
+     */
+    BY_RESOURCEMANAGER
+  }
 }

+ 2 - 12
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/CMgrCompletedContainersEvent.java

@@ -24,24 +24,14 @@ import org.apache.hadoop.yarn.api.records.ContainerId;
 
 public class CMgrCompletedContainersEvent extends ContainerManagerEvent {
 
-  private List<ContainerId> containerToCleanup;
-  private Reason reason;
+  private final List<ContainerId> containerToCleanup;
   
-  public CMgrCompletedContainersEvent(List<ContainerId> containersToCleanup, Reason reason) {
+  public CMgrCompletedContainersEvent(List<ContainerId> containersToCleanup) {
     super(ContainerManagerEventType.FINISH_CONTAINERS);
     this.containerToCleanup = containersToCleanup;
-    this.reason = reason;
   }
 
   public List<ContainerId> getContainersToCleanup() {
     return this.containerToCleanup;
   }
-  
-  public Reason getReason() {
-    return reason;
-  }
-  
-  public static enum Reason {
-    ON_SHUTDOWN, BY_RESOURCEMANAGER
-  }
 }

+ 1 - 77
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java

@@ -19,9 +19,6 @@
 package org.apache.hadoop.yarn.server.nodemanager;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.ConcurrentSkipListMap;
@@ -67,11 +64,6 @@ public class NodeManager extends CompositeService
    * Priority of the NodeManager shutdown hook.
    */
   public static final int SHUTDOWN_HOOK_PRIORITY = 30;
-  
-  /**
-   * Extra duration to wait for containers to be killed on shutdown.
-   */
-  private static final int SHUTDOWN_CLEANUP_SLOP_MS = 1000;
 
   private static final Log LOG = LogFactory.getLog(NodeManager.class);
   protected final NodeManagerMetrics metrics = NodeManagerMetrics.create();
@@ -84,8 +76,6 @@ public class NodeManager extends CompositeService
   private NodeStatusUpdater nodeStatusUpdater;
   private static CompositeServiceShutdownHook nodeManagerShutdownHook; 
   
-  private long waitForContainersOnShutdownMillis;
-  
   private AtomicBoolean isStopping = new AtomicBoolean(false);
   
   public NodeManager() {
@@ -193,13 +183,6 @@ public class NodeManager extends CompositeService
     // so that we make sure everything is up before registering with RM. 
     addService(nodeStatusUpdater);
     
-    waitForContainersOnShutdownMillis =
-        conf.getLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS,
-            YarnConfiguration.DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS) + 
-        conf.getLong(YarnConfiguration.NM_PROCESS_KILL_WAIT_MS,
-            YarnConfiguration.DEFAULT_NM_PROCESS_KILL_WAIT_MS) +
-        SHUTDOWN_CLEANUP_SLOP_MS;
-    
     super.serviceInit(conf);
     // TODO add local dirs to del
   }
@@ -219,9 +202,6 @@ public class NodeManager extends CompositeService
     if (isStopping.getAndSet(true)) {
       return;
     }
-    if (context != null) {
-      cleanupContainers(NodeManagerEventType.SHUTDOWN);
-    }
     super.serviceStop();
     DefaultMetricsSystem.shutdown();
   }
@@ -246,68 +226,12 @@ public class NodeManager extends CompositeService
       public void run() {
         LOG.info("Notifying ContainerManager to block new container-requests");
         containerManager.setBlockNewContainerRequests(true);
-        cleanupContainers(NodeManagerEventType.RESYNC);
+        containerManager.cleanUpApplications(NodeManagerEventType.RESYNC);
         ((NodeStatusUpdaterImpl) nodeStatusUpdater ).rebootNodeStatusUpdater();
       }
     }.start();
   }
 
-  @SuppressWarnings("unchecked")
-  protected void cleanupContainers(NodeManagerEventType eventType) {
-    Map<ContainerId, Container> containers = context.getContainers();
-    if (containers.isEmpty()) {
-      return;
-    }
-    LOG.info("Containers still running on " + eventType + " : "
-        + containers.keySet());
-    
-    List<ContainerId> containerIds =
-        new ArrayList<ContainerId>(containers.keySet());
-    dispatcher.getEventHandler().handle(
-        new CMgrCompletedContainersEvent(containerIds, 
-            CMgrCompletedContainersEvent.Reason.ON_SHUTDOWN));
-    
-    LOG.info("Waiting for containers to be killed");
-    
-    switch (eventType) {
-    case SHUTDOWN:
-      long waitStartTime = System.currentTimeMillis();
-      while (!containers.isEmpty()
-          && System.currentTimeMillis() - waitStartTime < waitForContainersOnShutdownMillis) {
-        try {
-          //To remove done containers in NM context
-          nodeStatusUpdater.getNodeStatusAndUpdateContainersInContext();
-          Thread.sleep(1000);
-        } catch (InterruptedException ex) {
-          LOG.warn("Interrupted while sleeping on container kill on shutdown",
-            ex);
-        }
-      }
-      break;
-    case RESYNC:
-      while (!containers.isEmpty()) {
-        try {
-          Thread.sleep(1000);
-          nodeStatusUpdater.getNodeStatusAndUpdateContainersInContext();
-        } catch (InterruptedException ex) {
-          LOG.warn("Interrupted while sleeping on container kill on resync",
-            ex);
-        }
-      }
-      break;
-    default:
-      LOG.warn("Invalid eventType: " + eventType);
-    }
-
-    // All containers killed
-    if (containers.isEmpty()) {
-      LOG.info("All containers in DONE state");
-    } else {
-      LOG.info("Done waiting for containers to be killed. Still alive: " + 
-          containers.keySet());
-    }
-  }
-
   public static class NMContext implements Context {
 
     private NodeId nodeId = null;

+ 5 - 5
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java

@@ -469,18 +469,18 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
             lastHeartBeatID = response.getResponseId();
             List<ContainerId> containersToCleanup = response
                 .getContainersToCleanup();
-            if (containersToCleanup.size() != 0) {
+            if (!containersToCleanup.isEmpty()) {
               dispatcher.getEventHandler().handle(
-                  new CMgrCompletedContainersEvent(containersToCleanup, 
-                      CMgrCompletedContainersEvent.Reason.BY_RESOURCEMANAGER));
+                  new CMgrCompletedContainersEvent(containersToCleanup));
             }
             List<ApplicationId> appsToCleanup =
                 response.getApplicationsToCleanup();
             //Only start tracking for keepAlive on FINISH_APP
             trackAppsForKeepAlive(appsToCleanup);
-            if (appsToCleanup.size() != 0) {
+            if (!appsToCleanup.isEmpty()) {
               dispatcher.getEventHandler().handle(
-                  new CMgrCompletedAppsEvent(appsToCleanup));
+                  new CMgrCompletedAppsEvent(appsToCleanup,
+                      CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
             }
           } catch (ConnectException e) {
             //catch and throw the exception if tried MAX wait time to connect RM

+ 148 - 46
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java

@@ -30,6 +30,9 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -71,6 +74,7 @@ import org.apache.hadoop.yarn.exceptions.InvalidAuxServiceException;
 import org.apache.hadoop.yarn.exceptions.InvalidContainerException;
 import org.apache.hadoop.yarn.exceptions.NMNotYetReadyException;
 import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
 import org.apache.hadoop.yarn.ipc.RPCUtil;
 import org.apache.hadoop.yarn.ipc.YarnRPC;
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
@@ -83,6 +87,7 @@ import org.apache.hadoop.yarn.server.nodemanager.Context;
 import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
 import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
 import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManagerEventType;
 import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger.AuditConstants;
 import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
 import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
@@ -120,6 +125,11 @@ public class ContainerManagerImpl extends CompositeService implements
     ServiceStateChangeListener, ContainerManagementProtocol,
     EventHandler<ContainerManagerEvent> {
 
+  /**
+   * Extra duration to wait for applications to be killed on shutdown.
+   */
+  private static final int SHUTDOWN_CLEANUP_SLOP_MS = 1000;
+
   private static final Log LOG = LogFactory.getLog(ContainerManagerImpl.class);
 
   final Context context;
@@ -138,6 +148,11 @@ public class ContainerManagerImpl extends CompositeService implements
 
   private final DeletionService deletionService;
   private AtomicBoolean blockNewContainerRequests = new AtomicBoolean(false);
+  private boolean serviceStopped = false;
+  private final ReadLock readLock;
+  private final WriteLock writeLock;
+
+  private long waitForContainersOnShutdownMillis;
 
   public ContainerManagerImpl(Context context, ContainerExecutor exec,
       DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
@@ -181,6 +196,10 @@ public class ContainerManagerImpl extends CompositeService implements
     dispatcher.register(ContainersLauncherEventType.class, containersLauncher);
     
     addService(dispatcher);
+
+    ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
+    this.readLock = lock.readLock();
+    this.writeLock = lock.writeLock();
   }
 
   @Override
@@ -190,6 +209,13 @@ public class ContainerManagerImpl extends CompositeService implements
     addIfService(logHandler);
     dispatcher.register(LogHandlerEventType.class, logHandler);
     
+    waitForContainersOnShutdownMillis =
+        conf.getLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS,
+            YarnConfiguration.DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS) +
+        conf.getLong(YarnConfiguration.NM_PROCESS_KILL_WAIT_MS,
+            YarnConfiguration.DEFAULT_NM_PROCESS_KILL_WAIT_MS) +
+        SHUTDOWN_CLEANUP_SLOP_MS;
+
     super.serviceInit(conf);
   }
 
@@ -275,6 +301,16 @@ public class ContainerManagerImpl extends CompositeService implements
 
   @Override
   public void serviceStop() throws Exception {
+    setBlockNewContainerRequests(true);
+    this.writeLock.lock();
+    try {
+      serviceStopped = true;
+      if (context != null) {
+        cleanUpApplications(NodeManagerEventType.SHUTDOWN);
+      }
+    } finally {
+      this.writeLock.unlock();
+    }
     if (auxiliaryServices.getServiceState() == STARTED) {
       auxiliaryServices.unregisterServiceListener(this);
     }
@@ -284,6 +320,60 @@ public class ContainerManagerImpl extends CompositeService implements
     super.serviceStop();
   }
 
+  public void cleanUpApplications(NodeManagerEventType eventType) {
+    Map<ApplicationId, Application> applications =
+        this.context.getApplications();
+    if (applications.isEmpty()) {
+      return;
+    }
+    LOG.info("Applications still running : " + applications.keySet());
+
+    List<ApplicationId> appIds =
+        new ArrayList<ApplicationId>(applications.keySet());
+    this.handle(
+        new CMgrCompletedAppsEvent(appIds,
+            CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN));
+
+    LOG.info("Waiting for Applications to be Finished");
+
+    switch (eventType) {
+      case SHUTDOWN:
+        long waitStartTime = System.currentTimeMillis();
+        while (!applications.isEmpty()
+            && System.currentTimeMillis() - waitStartTime
+                < waitForContainersOnShutdownMillis) {
+          try {
+            Thread.sleep(1000);
+          } catch (InterruptedException ex) {
+            LOG.warn("Interrupted while sleeping on applications finish on shutdown",
+              ex);
+          }
+        }
+        break;
+      case RESYNC:
+        while (!applications.isEmpty()) {
+          try {
+            Thread.sleep(1000);
+          } catch (InterruptedException ex) {
+            LOG.warn("Interrupted while sleeping on applications finish on resync",
+              ex);
+          }
+        }
+        break;
+      default:
+        throw new YarnRuntimeException("Get an unknown NodeManagerEventType: "
+            + eventType);
+    }
+
+    // All applications Finished
+    if (applications.isEmpty()) {
+      LOG.info("All applications in FINISHED state");
+    } else {
+      LOG.info("Done waiting for Applications to be Finished. Still alive: " +
+          applications.keySet());
+    }
+  }
+
   // Get the remoteUGI corresponding to the api call.
   protected UserGroupInformation getRemoteUgi()
       throws YarnException {
@@ -432,14 +522,14 @@ public class ContainerManagerImpl extends CompositeService implements
      * correct RMIdentifier. d) It is not expired.
      */
     authorizeStartRequest(nmTokenIdentifier, containerTokenIdentifier);
- 
+
     if (containerTokenIdentifier.getRMIdentifer() != nodeStatusUpdater
-        .getRMIdentifier()) {
-        // Is the container coming from unknown RM
-        StringBuilder sb = new StringBuilder("\nContainer ");
-        sb.append(containerTokenIdentifier.getContainerID().toString())
-          .append(" rejected as it is allocated by a previous RM");
-        throw new InvalidContainerException(sb.toString());
+      .getRMIdentifier()) {
+      // Is the container coming from unknown RM
+      StringBuilder sb = new StringBuilder("\nContainer ");
+      sb.append(containerTokenIdentifier.getContainerID().toString()).append(
+        " rejected as it is allocated by a previous RM");
+      throw new InvalidContainerException(sb.toString());
     }
     // update NMToken
     updateNMTokenIdentifier(nmTokenIdentifier);
@@ -453,13 +543,13 @@ public class ContainerManagerImpl extends CompositeService implements
     ContainerLaunchContext launchContext = request.getContainerLaunchContext();
 
     Map<String, ByteBuffer> serviceData = getAuxServiceMetaData();
-    if (launchContext.getServiceData()!=null && 
-        !launchContext.getServiceData().isEmpty()) {
+    if (launchContext.getServiceData() != null
+        && !launchContext.getServiceData().isEmpty()) {
       for (Map.Entry<String, ByteBuffer> meta : launchContext.getServiceData()
-          .entrySet()) {
+        .entrySet()) {
         if (null == serviceData.get(meta.getKey())) {
-          throw new InvalidAuxServiceException("The auxService:" + meta.getKey()
-              + " does not exist");
+          throw new InvalidAuxServiceException("The auxService:"
+              + meta.getKey() + " does not exist");
         }
       }
     }
@@ -479,30 +569,42 @@ public class ContainerManagerImpl extends CompositeService implements
           + " already is running on this node!!");
     }
 
-    // Create the application
-    Application application =
-        new ApplicationImpl(dispatcher, this.aclsManager, user, applicationID,
-          credentials, context);
-    if (null == context.getApplications().putIfAbsent(applicationID,
-      application)) {
-      LOG.info("Creating a new application reference for app " + applicationID);
-
-      dispatcher.getEventHandler().handle(
-        new ApplicationInitEvent(applicationID, container.getLaunchContext()
-          .getApplicationACLs()));
-    }
+    this.readLock.lock();
+    try {
+      if (!serviceStopped) {
+        // Create the application
+        Application application =
+            new ApplicationImpl(dispatcher, this.aclsManager, user,
+              applicationID, credentials, context);
+        if (null == context.getApplications().putIfAbsent(applicationID,
+          application)) {
+          LOG.info("Creating a new application reference for app "
+              + applicationID);
+
+          dispatcher.getEventHandler().handle(
+            new ApplicationInitEvent(applicationID, container
+              .getLaunchContext().getApplicationACLs()));
+        }
 
-    dispatcher.getEventHandler().handle(
-      new ApplicationContainerInitEvent(container));
+        dispatcher.getEventHandler().handle(
+          new ApplicationContainerInitEvent(container));
 
-    this.context.getContainerTokenSecretManager().startContainerSuccessful(
-      containerTokenIdentifier);
-    NMAuditLogger.logSuccess(user, AuditConstants.START_CONTAINER,
-      "ContainerManageImpl", applicationID, containerId);
-    // TODO launchedContainer misplaced -> doesn't necessarily mean a container
-    // launch. A finished Application will not launch containers.
-    metrics.launchedContainer();
-    metrics.allocateContainer(containerTokenIdentifier.getResource()); 
+        this.context.getContainerTokenSecretManager().startContainerSuccessful(
+          containerTokenIdentifier);
+        NMAuditLogger.logSuccess(user, AuditConstants.START_CONTAINER,
+          "ContainerManageImpl", applicationID, containerId);
+        // TODO launchedContainer misplaced -> doesn't necessarily mean a
+        // container
+        // launch. A finished Application will not launch containers.
+        metrics.launchedContainer();
+        metrics.allocateContainer(containerTokenIdentifier.getResource());
+      } else {
+        throw new YarnException("Container start failed as the NodeManager is "
+            + "in the process of shutting down");
+      }
+    } finally {
+      this.readLock.unlock();
+    }
   }
 
   protected ContainerTokenIdentifier verifyAndGetContainerTokenIdentifier(
@@ -727,9 +829,15 @@ public class ContainerManagerImpl extends CompositeService implements
       CMgrCompletedAppsEvent appsFinishedEvent =
           (CMgrCompletedAppsEvent) event;
       for (ApplicationId appID : appsFinishedEvent.getAppsToCleanup()) {
+        String diagnostic = "";
+        if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN) {
+          diagnostic = "Application killed on shutdown";
+        } else if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) {
+          diagnostic = "Application killed by ResourceManager";
+        }
         this.dispatcher.getEventHandler().handle(
             new ApplicationFinishEvent(appID,
-                "Application Killed by ResourceManager"));
+                diagnostic));
       }
       break;
     case FINISH_CONTAINERS:
@@ -737,20 +845,14 @@ public class ContainerManagerImpl extends CompositeService implements
           (CMgrCompletedContainersEvent) event;
       for (ContainerId container : containersFinishedEvent
           .getContainersToCleanup()) {
-        String diagnostic = "";
-        if (containersFinishedEvent.getReason() == 
-            CMgrCompletedContainersEvent.Reason.ON_SHUTDOWN) {
-          diagnostic = "Container Killed on Shutdown";
-        } else if (containersFinishedEvent.getReason() == 
-            CMgrCompletedContainersEvent.Reason.BY_RESOURCEMANAGER) {
-          diagnostic = "Container Killed by ResourceManager";
-        }
-        this.dispatcher.getEventHandler().handle(
-            new ContainerKillEvent(container, diagnostic));
+          this.dispatcher.getEventHandler().handle(
+              new ContainerKillEvent(container,
+                  "Container Killed by ResourceManager"));
       }
       break;
     default:
-      LOG.warn("Invalid event " + event.getType() + ". Ignoring.");
+        throw new YarnRuntimeException(
+            "Get an unknown ContainerManagerEvent type: " + event.getType());
     }
   }
 

+ 22 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java

@@ -178,6 +178,13 @@ public class ApplicationImpl implements Application {
                    ApplicationState.APPLICATION_RESOURCES_CLEANINGUP),
                ApplicationEventType.APPLICATION_CONTAINER_FINISHED,
                new AppFinishTransition())
+          .addTransition(ApplicationState.FINISHING_CONTAINERS_WAIT,
+              ApplicationState.FINISHING_CONTAINERS_WAIT,
+              EnumSet.of(
+                  ApplicationEventType.APPLICATION_LOG_HANDLING_INITED,
+                  ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED,
+                  ApplicationEventType.APPLICATION_INITED,
+                  ApplicationEventType.FINISH_APPLICATION))
 
            // Transitions from APPLICATION_RESOURCES_CLEANINGUP state
            .addTransition(ApplicationState.APPLICATION_RESOURCES_CLEANINGUP,
@@ -187,12 +194,25 @@ public class ApplicationImpl implements Application {
                ApplicationState.FINISHED,
                ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP,
                new AppCompletelyDoneTransition())
+          .addTransition(ApplicationState.APPLICATION_RESOURCES_CLEANINGUP,
+              ApplicationState.APPLICATION_RESOURCES_CLEANINGUP,
+              EnumSet.of(
+                  ApplicationEventType.APPLICATION_LOG_HANDLING_INITED,
+                  ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED,
+                  ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED,
+                  ApplicationEventType.APPLICATION_INITED,
+                  ApplicationEventType.FINISH_APPLICATION))
            
            // Transitions from FINISHED state
            .addTransition(ApplicationState.FINISHED,
                ApplicationState.FINISHED,
                ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED,
                new AppLogsAggregatedTransition())
+           .addTransition(ApplicationState.FINISHED, ApplicationState.FINISHED,
+               EnumSet.of(
+                  ApplicationEventType.APPLICATION_LOG_HANDLING_INITED,
+                  ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED,
+                  ApplicationEventType.FINISH_APPLICATION))
                
            // create the topology tables
            .installTopology();
@@ -344,7 +364,7 @@ public class ApplicationImpl implements Application {
     @Override
     public ApplicationState transition(ApplicationImpl app,
         ApplicationEvent event) {
-
+      ApplicationFinishEvent appEvent = (ApplicationFinishEvent)event;
       if (app.containers.isEmpty()) {
         // No container to cleanup. Cleanup app level resources.
         app.handleAppFinishWithContainersCleanedup();
@@ -356,7 +376,7 @@ public class ApplicationImpl implements Application {
       for (ContainerId containerID : app.containers.keySet()) {
         app.dispatcher.getEventHandler().handle(
             new ContainerKillEvent(containerID,
-                "Container killed on application-finish event from RM."));
+                "Container killed on application-finish event: " + appEvent.getDiagnostic()));
       }
       return ApplicationState.FINISHING_CONTAINERS_WAIT;
     }

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java

@@ -288,6 +288,7 @@ public class TestNodeManagerReboot {
       conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, "127.0.0.1:12346");
       conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
       conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
+      conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
       return conf;
     }
   }

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerResync.java

@@ -143,6 +143,7 @@ public class TestNodeManagerResync {
     conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
       remoteLogsDir.getAbsolutePath());
     conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
+    conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
     return conf;
   }
 

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerShutdown.java

@@ -242,6 +242,7 @@ public class TestNodeManagerShutdown {
     conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
     conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, remoteLogsDir.getAbsolutePath());
     conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
+    conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
     return conf;
   }
   

+ 21 - 8
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java

@@ -454,13 +454,13 @@ public class TestNodeStatusUpdater {
 
     @Override
     protected void serviceStop() throws Exception {
+      System.out.println("Called stooppppp");
       super.serviceStop();
       isStopped = true;
-      ConcurrentMap<ContainerId, org.apache.hadoop.yarn.server.nodemanager
-      .containermanager.container.Container> containers =
-          getNMContext().getContainers();
-      // ensure that containers are empty
-      if(!containers.isEmpty()) {
+      ConcurrentMap<ApplicationId, Application> applications =
+          getNMContext().getApplications();
+      // ensure that applications are empty
+      if(!applications.isEmpty()) {
         assertionFailedInThread.set(true);
       }
       syncBarrier.await(10000, TimeUnit.MILLISECONDS);
@@ -857,9 +857,20 @@ public class TestNodeStatusUpdater {
       }
       
       @Override
-      protected void cleanupContainers(NodeManagerEventType eventType) {
-        super.cleanupContainers(NodeManagerEventType.SHUTDOWN);
-        numCleanups.incrementAndGet();
+      protected ContainerManagerImpl createContainerManager(Context context,
+          ContainerExecutor exec, DeletionService del,
+          NodeStatusUpdater nodeStatusUpdater,
+          ApplicationACLsManager aclsManager,
+          LocalDirsHandlerService dirsHandler) {
+        return new ContainerManagerImpl(context, exec, del, nodeStatusUpdater,
+            metrics, aclsManager, dirsHandler) {
+
+          @Override
+          public void cleanUpApplications(NodeManagerEventType eventType) {
+            super.cleanUpApplications(NodeManagerEventType.SHUTDOWN);
+            numCleanups.incrementAndGet();
+          }
+        };
       }
     };
 
@@ -1159,6 +1170,7 @@ public class TestNodeStatusUpdater {
         .RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
         connectionRetryIntervalMs);
     conf.setLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, 5000);
+    conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
     CyclicBarrier syncBarrier = new CyclicBarrier(2);
     nm = new MyNodeManager2(syncBarrier, conf);
     nm.init(conf);
@@ -1305,6 +1317,7 @@ public class TestNodeStatusUpdater {
     conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
       remoteLogsDir.getAbsolutePath());
     conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
+    conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
     return conf;
   }
   

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java

@@ -166,6 +166,7 @@ public abstract class BaseContainerManagerTest {
     conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath());
     conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, remoteLogDir.getAbsolutePath());
 
+    conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
     // Default delSrvc
     delSrvc = createDeletionService();
     delSrvc.init(conf);

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java

@@ -542,7 +542,7 @@ public class TestContainerManager extends BaseContainerManagerTest {
 
     // Simulate RM sending an AppFinish event.
     containerManager.handle(new CMgrCompletedAppsEvent(Arrays
-        .asList(new ApplicationId[] { appId })));
+        .asList(new ApplicationId[] { appId }), CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN));
 
     BaseContainerManagerTest.waitForApplicationState(containerManager, 
         cId.getApplicationAttemptId().getApplicationId(),

+ 2 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/TestApplication.java

@@ -585,8 +585,8 @@ public class TestApplication {
     }
 
     public void appFinished() {
-      app.handle(new ApplicationEvent(appId,
-          ApplicationEventType.FINISH_APPLICATION));
+      app.handle(new ApplicationFinishEvent(appId,
+          "Finish Application"));
       drainDispatcherEvents();
     }
 

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java

@@ -826,7 +826,7 @@ public class TestLogAggregationService extends BaseContainerManagerTest {
         cId, ContainerState.COMPLETE);
 
     this.containerManager.handle(new CMgrCompletedAppsEvent(Arrays
-        .asList(appId)));
+        .asList(appId), CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN));
     this.containerManager.stop();
   }