|
@@ -19,26 +19,39 @@
|
|
|
package org.apache.hadoop.yarn.server.resourcemanager;
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
+import java.util.HashMap;
|
|
|
import java.util.List;
|
|
|
+import java.util.Map;
|
|
|
|
|
|
import junit.framework.Assert;
|
|
|
|
|
|
+import org.apache.commons.logging.Log;
|
|
|
+import org.apache.commons.logging.LogFactory;
|
|
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
|
|
import org.apache.hadoop.yarn.api.records.Container;
|
|
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
|
|
+import org.apache.hadoop.yarn.api.records.ContainerState;
|
|
|
+import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
|
|
import org.apache.hadoop.yarn.api.records.ResourceRequest;
|
|
|
+import org.apache.hadoop.yarn.event.Dispatcher;
|
|
|
+import org.apache.hadoop.yarn.event.DrainDispatcher;
|
|
|
+import org.apache.hadoop.yarn.event.EventHandler;
|
|
|
import org.apache.hadoop.yarn.server.api.records.HeartbeatResponse;
|
|
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
|
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
|
|
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
|
|
|
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
|
|
|
+import org.apache.hadoop.yarn.util.BuilderUtils;
|
|
|
import org.apache.log4j.Level;
|
|
|
import org.apache.log4j.LogManager;
|
|
|
import org.apache.log4j.Logger;
|
|
|
import org.junit.Test;
|
|
|
-import org.mortbay.log.Log;
|
|
|
|
|
|
public class TestApplicationCleanup {
|
|
|
|
|
|
+ private static final Log LOG = LogFactory
|
|
|
+ .getLog(TestApplicationCleanup.class);
|
|
|
+
|
|
|
@Test
|
|
|
public void testAppCleanup() throws Exception {
|
|
|
Logger rootLogger = LogManager.getRootLogger();
|
|
@@ -67,11 +80,13 @@ public class TestApplicationCleanup {
|
|
|
List<Container> conts = am.allocate(new ArrayList<ResourceRequest>(),
|
|
|
new ArrayList<ContainerId>()).getAllocatedContainers();
|
|
|
int contReceived = conts.size();
|
|
|
- while (contReceived < request) {
|
|
|
+ int waitCount = 0;
|
|
|
+ while (contReceived < request && waitCount++ < 20) {
|
|
|
conts = am.allocate(new ArrayList<ResourceRequest>(),
|
|
|
new ArrayList<ContainerId>()).getAllocatedContainers();
|
|
|
contReceived += conts.size();
|
|
|
- Log.info("Got " + contReceived + " containers. Waiting to get " + request);
|
|
|
+ LOG.info("Got " + contReceived + " containers. Waiting to get "
|
|
|
+ + request);
|
|
|
Thread.sleep(2000);
|
|
|
}
|
|
|
Assert.assertEquals(request, conts.size());
|
|
@@ -86,11 +101,12 @@ public class TestApplicationCleanup {
|
|
|
|
|
|
//currently only containers are cleaned via this
|
|
|
//AM container is cleaned via container launcher
|
|
|
- while (cleanedConts < 2 || cleanedApps < 1) {
|
|
|
+ waitCount = 0;
|
|
|
+ while ((cleanedConts < 3 || cleanedApps < 1) && waitCount++ < 20) {
|
|
|
HeartbeatResponse resp = nm1.nodeHeartbeat(true);
|
|
|
contsToClean = resp.getContainersToCleanupList();
|
|
|
apps = resp.getApplicationsToCleanupList();
|
|
|
- Log.info("Waiting to get cleanup events.. cleanedConts: "
|
|
|
+ LOG.info("Waiting to get cleanup events.. cleanedConts: "
|
|
|
+ cleanedConts + " cleanedApps: " + cleanedApps);
|
|
|
cleanedConts += contsToClean.size();
|
|
|
cleanedApps += apps.size();
|
|
@@ -99,6 +115,130 @@ public class TestApplicationCleanup {
|
|
|
|
|
|
Assert.assertEquals(1, apps.size());
|
|
|
Assert.assertEquals(app.getApplicationId(), apps.get(0));
|
|
|
+ Assert.assertEquals(1, cleanedApps);
|
|
|
+ Assert.assertEquals(3, cleanedConts);
|
|
|
+
|
|
|
+ rm.stop();
|
|
|
+ }
|
|
|
+
|
|
|
+  /**
+   * Builds a per-application status map that reports exactly one container
+   * as RUNNING, suitable for passing to the status-carrying node heartbeat.
+   *
+   * @param appId application the container is reported under
+   * @param containerId container to report as running
+   * @return a fresh, mutable map holding a single RUNNING status
+   */
+  private static Map<ApplicationId, List<ContainerStatus>> runningStatusFor(
+      ApplicationId appId, ContainerId containerId) {
+    List<ContainerStatus> statuses = new ArrayList<ContainerStatus>();
+    statuses.add(BuilderUtils.newContainerStatus(containerId,
+        ContainerState.RUNNING, "nothing", 0));
+    Map<ApplicationId, List<ContainerStatus>> byApp =
+        new HashMap<ApplicationId, List<ContainerStatus>>();
+    byApp.put(appId, statuses);
+    return byApp;
+  }
+
+  /**
+   * Gathers the container ids the RM asks the node to clean up. Starts with
+   * the ids already present in {@code firstResp}; while none have been seen,
+   * sends further plain heartbeats (at most 20, one second apart).
+   *
+   * @param nm node used to send the follow-up heartbeats
+   * @param dispatcher dispatcher drained after every heartbeat
+   * @param firstResp response of the heartbeat that triggered the cleanup
+   * @return every id collected so far; empty if the wait timed out
+   * @throws Exception if a heartbeat fails or the sleep is interrupted
+   */
+  private static List<ContainerId> collectContainersToCleanup(MockNM nm,
+      DrainDispatcher dispatcher, HeartbeatResponse firstResp)
+      throws Exception {
+    List<ContainerId> cleaned =
+        new ArrayList<ContainerId>(firstResp.getContainersToCleanupList());
+    int waitCount = 0;
+    while (cleaned.isEmpty() && waitCount++ < 20) {
+      LOG.info("Waiting to get cleanup events.. cleanedConts: "
+          + cleaned.size());
+      // Sleep before re-polling so no time is wasted once the id has arrived.
+      Thread.sleep(1000);
+      HeartbeatResponse resp = nm.nodeHeartbeat(true);
+      dispatcher.await();
+      cleaned.addAll(resp.getContainersToCleanupList());
+    }
+    return cleaned;
+  }
+
+  /**
+   * Checks that a released container is handed back to the node for cleanup
+   * when the node still reports it as RUNNING: once in the heartbeat right
+   * after the release, and once more after the RM already issued a cleanup.
+   *
+   * @throws Exception on any RM/NM/AM interaction failure
+   */
+  @Test
+  public void testContainerCleanup() throws Exception {
+
+    Logger rootLogger = LogManager.getRootLogger();
+    rootLogger.setLevel(Level.DEBUG);
+    final DrainDispatcher dispatcher = new DrainDispatcher();
+    MockRM rm = new MockRM() {
+      @Override
+      protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
+        // Deliver scheduler events inline instead of via the base dispatcher.
+        return new SchedulerEventDispatcher(this.scheduler) {
+          @Override
+          public void handle(SchedulerEvent event) {
+            scheduler.handle(event);
+          }
+        };
+      }
+
+      @Override
+      protected Dispatcher createDispatcher() {
+        return dispatcher;
+      }
+    };
+    rm.start();
+
+    // Stop the RM even when an assertion fails, so it is not left running.
+    try {
+      MockNM nm1 = rm.registerNode("h1:1234", 5000);
+
+      RMApp app = rm.submitApp(2000);
+
+      //kick the scheduling
+      nm1.nodeHeartbeat(true);
+
+      RMAppAttempt attempt = app.getCurrentAppAttempt();
+      MockAM am = rm.sendAMLaunched(attempt.getAppAttemptId());
+      am.registerAppAttempt();
+
+      //request for containers
+      int request = 2;
+      am.allocate("h1" , 1000, request,
+          new ArrayList<ContainerId>());
+      dispatcher.await();
+
+      //kick the scheduler
+      nm1.nodeHeartbeat(true);
+
+      // Accumulate containers across responses: they are not guaranteed to
+      // arrive in a single allocate call, and index 1 is needed below.
+      List<Container> allocated = new ArrayList<Container>(
+          am.allocate(new ArrayList<ResourceRequest>(),
+              new ArrayList<ContainerId>()).getAllocatedContainers());
+      int waitCount = 0;
+      while (allocated.size() < request && waitCount++ < 20) {
+        LOG.info("Got " + allocated.size() + " containers. Waiting to get "
+            + request);
+        Thread.sleep(2000);
+        allocated.addAll(am.allocate(new ArrayList<ResourceRequest>(),
+            new ArrayList<ContainerId>()).getAllocatedContainers());
+        dispatcher.await();
+      }
+      Assert.assertEquals(request, allocated.size());
+
+      // Release a container.
+      ContainerId releasedId = allocated.get(1).getId();
+      ArrayList<ContainerId> release = new ArrayList<ContainerId>();
+      release.add(releasedId);
+      am.allocate(new ArrayList<ResourceRequest>(), release);
+      dispatcher.await();
+
+      // Send one more heartbeat with a fake running container. This is to
+      // simulate the situation that can happen if the NM reports that
+      // container is running in the same heartbeat when the RM asks it to
+      // clean it up.
+      HeartbeatResponse resp = nm1.nodeHeartbeat(
+          runningStatusFor(app.getApplicationId(), releasedId), true);
+      dispatcher.await();
+      List<ContainerId> cleaned =
+          collectContainersToCleanup(nm1, dispatcher, resp);
+      // Assert before indexing so a timeout fails with a readable message.
+      Assert.assertEquals(1, cleaned.size());
+      LOG.info("Got cleanup for " + cleaned.get(0));
+
+      // Now to test the case when RM already gave cleanup, and NM suddenly
+      // realizes that the container is running.
+      LOG.info("Testing container launch much after release and "
+          + "NM getting cleanup");
+      resp = nm1.nodeHeartbeat(
+          runningStatusFor(app.getApplicationId(), releasedId), true);
+      dispatcher.await();
+      // The cleanup list won't be instantaneous as it is given out by
+      // scheduler and not RMNodeImpl.
+      cleaned = collectContainersToCleanup(nm1, dispatcher, resp);
+      Assert.assertEquals(1, cleaned.size());
+      LOG.info("Got cleanup for " + cleaned.get(0));
+    } finally {
+      rm.stop();
+    }
  }
|