@@ -35,8 +35,12 @@ import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NMToken;
+import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.event.DrainDispatcher;
+import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
@@ -49,11 +53,14 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
+import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.ControlledClock;
import org.apache.hadoop.yarn.util.Records;
import org.apache.hadoop.yarn.util.SystemClock;
@@ -82,21 +89,7 @@ public class TestAMRestart {

MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
int NUM_CONTAINERS = 3;
- // allocate NUM_CONTAINERS containers
- am1.allocate("127.0.0.1", 1024, NUM_CONTAINERS,
- new ArrayList<ContainerId>());
- nm1.nodeHeartbeat(true);
-
- // wait for containers to be allocated.
- List<Container> containers =
- am1.allocate(new ArrayList<ResourceRequest>(),
- new ArrayList<ContainerId>()).getAllocatedContainers();
- while (containers.size() != NUM_CONTAINERS) {
- nm1.nodeHeartbeat(true);
- containers.addAll(am1.allocate(new ArrayList<ResourceRequest>(),
- new ArrayList<ContainerId>()).getAllocatedContainers());
- Thread.sleep(200);
- }
+ allocateContainers(nm1, am1, NUM_CONTAINERS);

// launch the 2nd container, for testing running container transferred.
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
@@ -244,6 +237,29 @@ public class TestAMRestart {
rm1.stop();
}

+ private List<Container> allocateContainers(MockNM nm1, MockAM am1,
+ int NUM_CONTAINERS) throws Exception {
+ // allocate NUM_CONTAINERS containers
+ am1.allocate("127.0.0.1", 1024, NUM_CONTAINERS,
+ new ArrayList<ContainerId>());
+ nm1.nodeHeartbeat(true);
+
+ // wait for containers to be allocated.
+ List<Container> containers =
+ am1.allocate(new ArrayList<ResourceRequest>(),
+ new ArrayList<ContainerId>()).getAllocatedContainers();
+ while (containers.size() != NUM_CONTAINERS) {
+ nm1.nodeHeartbeat(true);
+ containers.addAll(am1.allocate(new ArrayList<ResourceRequest>(),
+ new ArrayList<ContainerId>()).getAllocatedContainers());
+ Thread.sleep(200);
+ }
+
+ Assert.assertEquals("Did not get all containers allocated",
+ NUM_CONTAINERS, containers.size());
+ return containers;
+ }
+
private void waitForContainersToFinish(int expectedNum, RMAppAttempt attempt)
throws InterruptedException {
int count = 0;
@@ -258,6 +274,9 @@ public class TestAMRestart {
public void testNMTokensRebindOnAMRestart() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 3);
+ // To prevent the test from blacklisting nm1 for the AM, set the disable
+ // threshold to half of the 2 nodes, i.e. 1 node.
+ conf.setFloat(YarnConfiguration.AM_BLACKLISTING_DISABLE_THRESHOLD, 0.5f);

MockRM rm1 = new MockRM(conf);
rm1.start();
@@ -355,6 +374,106 @@ public class TestAMRestart {
rm1.stop();
}

+ @Test(timeout = 100000)
+ public void testAMBlacklistPreventsRestartOnSameNode() throws Exception {
+ YarnConfiguration conf = new YarnConfiguration();
+ conf.setBoolean(YarnConfiguration.AM_BLACKLISTING_ENABLED, true);
+ MemoryRMStateStore memStore = new MemoryRMStateStore();
+ memStore.init(conf);
+ final DrainDispatcher dispatcher = new DrainDispatcher();
+ MockRM rm1 = new MockRM(conf, memStore) {
+ @Override
+ protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
+ return new SchedulerEventDispatcher(this.scheduler) {
+ @Override
+ public void handle(SchedulerEvent event) {
+ scheduler.handle(event);
+ }
+ };
+ }
+
+ @Override
+ protected Dispatcher createDispatcher() {
+ return dispatcher;
+ }
+ };
+
+ rm1.start();
+
+ MockNM nm1 =
+ new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
+ nm1.registerNode();
+
+ MockNM nm2 =
+ new MockNM("127.0.0.2:2345", 8000, rm1.getResourceTrackerService());
+ nm2.registerNode();
+
+ RMApp app1 = rm1.submitApp(200);
+
+ MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+ CapacityScheduler scheduler =
+ (CapacityScheduler) rm1.getResourceScheduler();
+ ContainerId amContainer =
+ ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
+ // Fail the first attempt's AM container with a disk failure.
+ RMContainer rmContainer = scheduler.getRMContainer(amContainer);
+ NodeId nodeWhereAMRan = rmContainer.getAllocatedNode();
+
+ MockNM currentNode, otherNode;
+ if (nodeWhereAMRan.equals(nm1.getNodeId())) {
+ currentNode = nm1;
+ otherNode = nm2;
+ } else {
+ currentNode = nm2;
+ otherNode = nm1;
+ }
+
+ ContainerStatus containerStatus =
+ BuilderUtils.newContainerStatus(amContainer, ContainerState.COMPLETE,
+ "", ContainerExitStatus.DISKS_FAILED);
+ currentNode.containerStatus(containerStatus);
+ am1.waitForState(RMAppAttemptState.FAILED);
+ rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
+
+ // Restart the AM.
+ RMAppAttempt attempt = rm1.waitForAttemptScheduled(app1, rm1);
+ System.out.println("Launch AM " + attempt.getAppAttemptId());
+
+
+
+ currentNode.nodeHeartbeat(true);
+ dispatcher.await();
+ Assert.assertEquals(
+ "AppAttemptState should still be SCHEDULED if currentNode is " +
+ "blacklisted correctly",
+ RMAppAttemptState.SCHEDULED,
+ attempt.getAppAttemptState());
+
+ otherNode.nodeHeartbeat(true);
+ dispatcher.await();
+
+ MockAM am2 = rm1.sendAMLaunched(attempt.getAppAttemptId());
+ rm1.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.LAUNCHED);
+
+ amContainer =
+ ContainerId.newContainerId(am2.getApplicationAttemptId(), 1);
+ rmContainer = scheduler.getRMContainer(amContainer);
+ nodeWhereAMRan = rmContainer.getAllocatedNode();
+ Assert.assertEquals(
+ "After blacklisting AM should have run on the other node",
+ otherNode.getNodeId(), nodeWhereAMRan);
+
+ am2.registerAppAttempt();
+ rm1.waitForState(app1.getApplicationId(), RMAppState.RUNNING);
+
+ List<Container> allocatedContainers =
+ allocateContainers(currentNode, am2, 1);
+ Assert.assertEquals(
+ "Even though AM is blacklisted from the node, application can still " +
+ "allocate containers there",
+ currentNode.getNodeId(), allocatedContainers.get(0).getNodeId());
+ }
+
// AM container preempted, nm disk failure
// should not be counted towards AM max retry count.
@Test(timeout = 100000)