소스 검색

YARN-3675. FairScheduler: RM quits when node removal races with continuous-scheduling on the same node. (Anubhav Dhoot via kasha)

Karthik Kambatla 10 년 전
부모
커밋
4513761869

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -541,6 +541,9 @@ Release 2.7.1 - UNRELEASED
     YARN-3646. Applications are getting stuck some times in case of retry
     policy forever. (Raju Bairishetti via devaraj)
 
+    YARN-3675. FairScheduler: RM quits when node removal races with 
+    continuous-scheduling on the same node. (Anubhav Dhoot via kasha)
+
 Release 2.7.0 - 2015-04-20
 
   INCOMPATIBLE CHANGES

+ 12 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java

@@ -1039,13 +1039,23 @@ public class FairScheduler extends
               nodes.get(n1).getAvailableResource());
     }
   }
-  
-  private synchronized void attemptScheduling(FSSchedulerNode node) {
+
+  @VisibleForTesting
+  synchronized void attemptScheduling(FSSchedulerNode node) {
     if (rmContext.isWorkPreservingRecoveryEnabled()
         && !rmContext.isSchedulerReadyForAllocatingContainers()) {
       return;
     }
 
+    final NodeId nodeID = node.getNodeID();
+    if (!nodes.containsKey(nodeID)) {
+      // The node might have just been removed while this thread was waiting
+      // on the synchronized lock before it entered this synchronized method
+      LOG.info("Skipping scheduling as the node " + nodeID +
+          " has been removed");
+      return;
+    }
+
     // Assign new containers...
     // 1. Check for reserved applications
     // 2. Schedule if there are no reservations

+ 44 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java

@@ -3889,6 +3889,50 @@ public class TestFairScheduler extends FairSchedulerTestBase {
     }
   }
 
+  @Test
+  public void testSchedulingOnRemovedNode() throws Exception {
+    // Disable continuous scheduling, will invoke continuous scheduling manually
+    scheduler.init(conf);
+    scheduler.start();
+    Assert.assertTrue("Continuous scheduling should be disabled.",
+        !scheduler.isContinuousSchedulingEnabled());
+
+    ApplicationAttemptId id11 = createAppAttemptId(1, 1);
+    createMockRMApp(id11);
+
+    scheduler.addApplication(id11.getApplicationId(), "root.queue1", "user1",
+        false);
+    scheduler.addApplicationAttempt(id11, false, false);
+
+    List<ResourceRequest> ask1 = new ArrayList<>();
+    ResourceRequest request1 =
+        createResourceRequest(1024, 8, ResourceRequest.ANY, 1, 1, true);
+
+    ask1.add(request1);
+    scheduler.allocate(id11, ask1, new ArrayList<ContainerId>(), null,
+        null);
+
+    String hostName = "127.0.0.1";
+    RMNode node1 = MockNodes.newNodeInfo(1,
+      Resources.createResource(8 * 1024, 8), 1, hostName);
+    NodeAddedSchedulerEvent nodeEvent1 = new NodeAddedSchedulerEvent(node1);
+    scheduler.handle(nodeEvent1);
+
+    FSSchedulerNode node = (FSSchedulerNode)scheduler.getSchedulerNode(
+      node1.getNodeID());
+
+    NodeRemovedSchedulerEvent removeNode1 =
+        new NodeRemovedSchedulerEvent(node1);
+    scheduler.handle(removeNode1);
+
+    scheduler.attemptScheduling(node);
+
+    AppAttemptRemovedSchedulerEvent appRemovedEvent1 =
+        new AppAttemptRemovedSchedulerEvent(id11,
+            RMAppAttemptState.FINISHED, false);
+    scheduler.handle(appRemovedEvent1);
+  }
+
   @Test
   public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured()
       throws IOException {