Prechádzať zdrojové kódy

YARN-3251. Fixed a deadlock in CapacityScheduler when computing absoluteMaxAvailableCapacity in LeafQueue (Craig Welch via wangda)

Wangda Tan 10 rokov pred
rodič
commit
881084fe5c

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -619,6 +619,9 @@ Release 2.6.0 - 2014-11-18
     identifiers to be tampered and thus causing app submission failures in
     identifiers to be tampered and thus causing app submission failures in
     secure mode. (Jian He via vinodkv)
     secure mode. (Jian He via vinodkv)
 
 
+    YARN-3251. Fixed a deadlock in CapacityScheduler when computing 
+    absoluteMaxAvailableCapacity in LeafQueue (Craig Welch via wangda)
+ 
   BREAKDOWN OF YARN-1051 SUBTASKS AND RELATED JIRAS
   BREAKDOWN OF YARN-1051 SUBTASKS AND RELATED JIRAS
 
 
     YARN-1707. Introduce APIs to add/remove/resize queues in the
     YARN-1707. Introduce APIs to add/remove/resize queues in the

+ 17 - 7
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java

@@ -115,6 +115,8 @@ public class LeafQueue extends AbstractCSQueue {
   
   
   private final QueueHeadroomInfo queueHeadroomInfo = new QueueHeadroomInfo();
   private final QueueHeadroomInfo queueHeadroomInfo = new QueueHeadroomInfo();
   
   
+  private volatile float absoluteMaxAvailCapacity;
+  
   public LeafQueue(CapacitySchedulerContext cs, 
   public LeafQueue(CapacitySchedulerContext cs, 
       String queueName, CSQueue parent, CSQueue old) throws IOException {
       String queueName, CSQueue parent, CSQueue old) throws IOException {
     super(cs, queueName, parent, old);
     super(cs, queueName, parent, old);
@@ -133,6 +135,10 @@ public class LeafQueue extends AbstractCSQueue {
         (float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
         (float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
     float absoluteMaxCapacity = 
     float absoluteMaxCapacity = 
         CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
         CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
+        
+    // Start at absoluteMaxCapacity; assignContainers later replaces this
+    // with a more accurate max-available value.
+    absoluteMaxAvailCapacity = absoluteMaxCapacity;
 
 
     int userLimit = cs.getConfiguration().getUserLimit(getQueuePath());
     int userLimit = cs.getConfiguration().getUserLimit(getQueuePath());
     float userLimitFactor = 
     float userLimitFactor = 
@@ -720,8 +726,18 @@ public class LeafQueue extends AbstractCSQueue {
   }
   }
   
   
   @Override
   @Override
-  public synchronized CSAssignment assignContainers(Resource clusterResource,
+  public CSAssignment assignContainers(Resource clusterResource,
       FiCaSchedulerNode node, boolean needToUnreserve) {
       FiCaSchedulerNode node, boolean needToUnreserve) {
+    // Never hold the lock on a queue and on its parent at the same time:
+    // doing so can deadlock against calls that walk down the tree
+    // concurrently (e.g. getQueueInfo).
+    absoluteMaxAvailCapacity = CSQueueUtils.getAbsoluteMaxAvailCapacity(
+      resourceCalculator, clusterResource, this);
+    return assignContainersInternal(clusterResource, node, needToUnreserve);
+  }
+  
+  private synchronized CSAssignment assignContainersInternal(
+    Resource clusterResource, FiCaSchedulerNode node, boolean needToUnreserve) {
 
 
     if(LOG.isDebugEnabled()) {
     if(LOG.isDebugEnabled()) {
       LOG.debug("assignContainers: node=" + node.getNodeName()
       LOG.debug("assignContainers: node=" + node.getNodeName()
@@ -1012,12 +1028,6 @@ public class LeafQueue extends AbstractCSQueue {
         computeUserLimit(application, clusterResource, required,
         computeUserLimit(application, clusterResource, required,
             queueUser, requestedLabels);
             queueUser, requestedLabels);
 
 
-    //Max avail capacity needs to take into account usage by ancestor-siblings
-    //which are greater than their base capacity, so we are interested in "max avail"
-    //capacity
-    float absoluteMaxAvailCapacity = CSQueueUtils.getAbsoluteMaxAvailCapacity(
-      resourceCalculator, clusterResource, this);
-
     Resource queueMaxCap =                        // Queue Max-Capacity
     Resource queueMaxCap =                        // Queue Max-Capacity
         Resources.multiplyAndNormalizeDown(
         Resources.multiplyAndNormalizeDown(
             resourceCalculator, 
             resourceCalculator,