浏览代码

YARN-3675. FairScheduler: RM quits when node removal races with continuous-scheduling on the same node. (Anubhav Dhoot via kasha)

(cherry picked from commit a8b50e46737c11936ba72c427da69b2365a07aac)
Karthik Kambatla 10 年之前
父节点
当前提交
e8ac88d4fe

+ 4 - 0
hadoop-yarn-project/CHANGES.txt

@@ -464,6 +464,7 @@ Release 2.7.1 - UNRELEASED
     YARN-3493. RM fails to come up with error "Failed to load/recover state" 
     YARN-3493. RM fails to come up with error "Failed to load/recover state" 
     when mem settings are changed. (Jian He via wangda)
     when mem settings are changed. (Jian He via wangda)
 
 
+<<<<<<< HEAD
     YARN-3626. On Windows localized resources are not moved to the front
     YARN-3626. On Windows localized resources are not moved to the front
     of the classpath when they should be. (Craig Welch via xgong)
     of the classpath when they should be. (Craig Welch via xgong)
 
 
@@ -499,6 +500,9 @@ Release 2.7.1 - UNRELEASED
     YARN-3646. Applications are getting stuck some times in case of retry
     YARN-3646. Applications are getting stuck some times in case of retry
     policy forever. (Raju Bairishetti via devaraj)
     policy forever. (Raju Bairishetti via devaraj)
 
 
+    YARN-3675. FairScheduler: RM quits when node removal races with 
+    continuous-scheduling on the same node. (Anubhav Dhoot via kasha)
+
 Release 2.7.0 - 2015-04-20
 Release 2.7.0 - 2015-04-20
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 12 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java

@@ -1039,13 +1039,23 @@ public class FairScheduler extends
               nodes.get(n1).getAvailableResource());
               nodes.get(n1).getAvailableResource());
     }
     }
   }
   }
-  
-  private synchronized void attemptScheduling(FSSchedulerNode node) {
+
+  @VisibleForTesting
+  synchronized void attemptScheduling(FSSchedulerNode node) {
     if (rmContext.isWorkPreservingRecoveryEnabled()
     if (rmContext.isWorkPreservingRecoveryEnabled()
         && !rmContext.isSchedulerReadyForAllocatingContainers()) {
         && !rmContext.isSchedulerReadyForAllocatingContainers()) {
       return;
       return;
     }
     }
 
 
+    final NodeId nodeID = node.getNodeID();
+    if (!nodes.containsKey(nodeID)) {
+      // The node might have just been removed while this thread was waiting
+      // on the synchronized lock before it entered this synchronized method
+      LOG.info("Skipping scheduling as the node " + nodeID +
+          " has been removed");
+      return;
+    }
+
     // Assign new containers...
     // Assign new containers...
     // 1. Check for reserved applications
     // 1. Check for reserved applications
     // 2. Schedule if there are no reservations
     // 2. Schedule if there are no reservations

+ 44 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java

@@ -3889,6 +3889,50 @@ public class TestFairScheduler extends FairSchedulerTestBase {
     }
     }
   }
   }
 
 
+  @Test
+  public void testSchedulingOnRemovedNode() throws Exception {
+    // Disable continuous scheduling, will invoke continuous scheduling manually
+    scheduler.init(conf);
+    scheduler.start();
+    Assert.assertTrue("Continuous scheduling should be disabled.",
+        !scheduler.isContinuousSchedulingEnabled());
+
+    ApplicationAttemptId id11 = createAppAttemptId(1, 1);
+    createMockRMApp(id11);
+
+    scheduler.addApplication(id11.getApplicationId(), "root.queue1", "user1",
+        false);
+    scheduler.addApplicationAttempt(id11, false, false);
+
+    List<ResourceRequest> ask1 = new ArrayList<>();
+    ResourceRequest request1 =
+        createResourceRequest(1024, 8, ResourceRequest.ANY, 1, 1, true);
+
+    ask1.add(request1);
+    scheduler.allocate(id11, ask1, new ArrayList<ContainerId>(), null,
+        null);
+
+    String hostName = "127.0.0.1";
+    RMNode node1 = MockNodes.newNodeInfo(1,
+      Resources.createResource(8 * 1024, 8), 1, hostName);
+    NodeAddedSchedulerEvent nodeEvent1 = new NodeAddedSchedulerEvent(node1);
+    scheduler.handle(nodeEvent1);
+
+    FSSchedulerNode node = (FSSchedulerNode)scheduler.getSchedulerNode(
+      node1.getNodeID());
+
+    NodeRemovedSchedulerEvent removeNode1 =
+        new NodeRemovedSchedulerEvent(node1);
+    scheduler.handle(removeNode1);
+
+    scheduler.attemptScheduling(node);
+
+    AppAttemptRemovedSchedulerEvent appRemovedEvent1 =
+        new AppAttemptRemovedSchedulerEvent(id11,
+            RMAppAttemptState.FINISHED, false);
+    scheduler.handle(appRemovedEvent1);
+  }
+
   @Test
   @Test
   public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured()
   public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured()
       throws IOException {
       throws IOException {