瀏覽代碼

YARN-6029. CapacityScheduler deadlock when ParentQueue#getQueueUserAclInfo is called by one thread and LeafQueue#assignContainers is releasing excessive reserved container is called by another thread. (Tao Yang via wangda)

Wangda Tan 8 年之前
父節點
當前提交
ebafe075d6

+ 114 - 99
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java

@@ -895,129 +895,144 @@ public class LeafQueue extends AbstractCSQueue {
   }
   
   @Override
-  public synchronized CSAssignment assignContainers(Resource clusterResource,
+  public CSAssignment assignContainers(Resource clusterResource,
       FiCaSchedulerNode node, ResourceLimits currentResourceLimits,
       SchedulingMode schedulingMode) {
-    updateCurrentResourceLimits(currentResourceLimits, clusterResource);
+    FiCaSchedulerApp reservedApp = null;
+    CSAssignment reservedCSAssignment = null;
 
-    if (LOG.isDebugEnabled()) {
-      LOG.debug("assignContainers: node=" + node.getNodeName()
-          + " #applications=" + orderingPolicy.getNumSchedulableEntities());
-    }
+    synchronized (this) {
+      updateCurrentResourceLimits(currentResourceLimits, clusterResource);
 
-    setPreemptionAllowed(currentResourceLimits, node.getPartition());
+      if (LOG.isDebugEnabled()) {
+        LOG.debug(
+            "assignContainers: node=" + node.getNodeName() + " #applications="
+                + orderingPolicy.getNumSchedulableEntities());
+      }
 
-    // Check for reserved resources
-    RMContainer reservedContainer = node.getReservedContainer();
-    if (reservedContainer != null) {
-      FiCaSchedulerApp application =
-          getApplication(reservedContainer.getApplicationAttemptId());
-      synchronized (application) {
-        CSAssignment assignment =
-            application.assignContainers(clusterResource, node,
-                currentResourceLimits, schedulingMode, reservedContainer);
-        handleExcessReservedContainer(clusterResource, assignment, node,
-            application);
-        killToPreemptContainers(clusterResource, node, assignment);
-        return assignment;
+      setPreemptionAllowed(currentResourceLimits, node.getPartition());
+
+      // Check for reserved resources
+      RMContainer reservedContainer = node.getReservedContainer();
+      if (reservedContainer != null) {
+        reservedApp = getApplication(
+            reservedContainer.getApplicationAttemptId());
+        synchronized (reservedApp) {
+          reservedCSAssignment = reservedApp.assignContainers(
+              clusterResource, node, currentResourceLimits, schedulingMode,
+              reservedContainer);
+        }
       }
     }
 
-    // if our queue cannot access this node, just return
-    if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY
-        && !accessibleToPartition(node.getPartition())) {
-      return CSAssignment.NULL_ASSIGNMENT;
+    // Handle possible completedContainer out of synchronized lock to avoid
+    // deadlock.
+    if (reservedCSAssignment != null) {
+      handleExcessReservedContainer(clusterResource, reservedCSAssignment, node,
+          reservedApp);
+      killToPreemptContainers(clusterResource, node, reservedCSAssignment);
+      return reservedCSAssignment;
     }
 
-    // Check if this queue need more resource, simply skip allocation if this
-    // queue doesn't need more resources.
-    if (!hasPendingResourceRequest(node.getPartition(), clusterResource,
-        schedulingMode)) {
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Skip this queue=" + getQueuePath()
-            + ", because it doesn't need more resource, schedulingMode="
-            + schedulingMode.name() + " node-partition=" + node.getPartition());
+    synchronized (this) {
+      // if our queue cannot access this node, just return
+      if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY
+          && !accessibleToPartition(node.getPartition())) {
+        return CSAssignment.NULL_ASSIGNMENT;
       }
-      return CSAssignment.NULL_ASSIGNMENT;
-    }
 
-    for (Iterator<FiCaSchedulerApp> assignmentIterator =
-        orderingPolicy.getAssignmentIterator(); assignmentIterator.hasNext();) {
-      FiCaSchedulerApp application = assignmentIterator.next();
-
-      // Check queue max-capacity limit
-      if (!super.canAssignToThisQueue(clusterResource, node.getPartition(),
-          currentResourceLimits, application.getCurrentReservation(),
+      // Check if this queue need more resource, simply skip allocation if this
+      // queue doesn't need more resources.
+      if (!hasPendingResourceRequest(node.getPartition(), clusterResource,
           schedulingMode)) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Skip this queue=" + getQueuePath()
+              + ", because it doesn't need more resource, schedulingMode="
+              + schedulingMode.name() + " node-partition=" + node
+              .getPartition());
+        }
         return CSAssignment.NULL_ASSIGNMENT;
       }
-      
-      Resource userLimit =
-          computeUserLimitAndSetHeadroom(application, clusterResource,
-              node.getPartition(), schedulingMode);
-
-      // Check user limit
-      if (!canAssignToUser(clusterResource, application.getUser(), userLimit,
-          application, node.getPartition(), currentResourceLimits)) {
-        application.updateAMContainerDiagnostics(AMState.ACTIVATED,
-            "User capacity has reached its maximum limit.");
-        continue;
-      }
 
-      // Try to schedule
-      CSAssignment assignment =
-          application.assignContainers(clusterResource, node,
-              currentResourceLimits, schedulingMode, null);
+      for (Iterator<FiCaSchedulerApp> assignmentIterator =
+           orderingPolicy.getAssignmentIterator(); assignmentIterator
+               .hasNext(); ) {
+        FiCaSchedulerApp application = assignmentIterator.next();
 
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("post-assignContainers for application "
-            + application.getApplicationId());
-        application.showRequests();
-      }
+        // Check queue max-capacity limit
+        if (!super.canAssignToThisQueue(clusterResource, node.getPartition(),
+            currentResourceLimits, application.getCurrentReservation(),
+            schedulingMode)) {
+          return CSAssignment.NULL_ASSIGNMENT;
+        }
 
-      // Did we schedule or reserve a container?
-      Resource assigned = assignment.getResource();
-      
-      handleExcessReservedContainer(clusterResource, assignment, node,
-          application);
-      killToPreemptContainers(clusterResource, node, assignment);
+        Resource userLimit =
+            computeUserLimitAndSetHeadroom(application, clusterResource,
+                node.getPartition(), schedulingMode);
 
-      if (Resources.greaterThan(resourceCalculator, clusterResource, assigned,
-          Resources.none())) {
-        // Get reserved or allocated container from application
-        RMContainer reservedOrAllocatedRMContainer =
-            application.getRMContainer(assignment.getAssignmentInformation()
-                .getFirstAllocatedOrReservedContainerId());
+        // Check user limit
+        if (!canAssignToUser(clusterResource, application.getUser(), userLimit,
+            application, node.getPartition(), currentResourceLimits)) {
+          application.updateAMContainerDiagnostics(AMState.ACTIVATED,
+              "User capacity has reached its maximum limit.");
+          continue;
+        }
 
-        // Book-keeping
-        // Note: Update headroom to account for current allocation too...
-        allocateResource(clusterResource, application, assigned,
-            node.getPartition(), reservedOrAllocatedRMContainer,
-            assignment.isIncreasedAllocation());
-
-        // Update reserved metrics
-        Resource reservedRes = assignment.getAssignmentInformation()
-            .getReserved();
-        if (reservedRes != null && !reservedRes.equals(Resources.none())) {
-          incReservedResource(node.getPartition(), reservedRes);
+        // Try to schedule
+        CSAssignment assignment =
+            application.assignContainers(clusterResource, node,
+                currentResourceLimits, schedulingMode, null);
+
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("post-assignContainers for application "
+              + application.getApplicationId());
+          application.showRequests();
         }
 
-        // Done
-        return assignment;
-      } else if (assignment.getSkippedType()
-          == CSAssignment.SkippedType.OTHER) {
-        application.updateNodeInfoForAMDiagnostics(node);
-      } else if(assignment.getSkippedType()
-          == CSAssignment.SkippedType.QUEUE_LIMIT) {
-        return assignment;
-      } else {
-        // If we don't allocate anything, and it is not skipped by application,
-        // we will return to respect FIFO of applications
-        return CSAssignment.NULL_ASSIGNMENT;
+        // Did we schedule or reserve a container?
+        Resource assigned = assignment.getResource();
+
+        handleExcessReservedContainer(clusterResource, assignment, node,
+            application);
+        killToPreemptContainers(clusterResource, node, assignment);
+
+        if (Resources.greaterThan(resourceCalculator, clusterResource, assigned,
+            Resources.none())) {
+          // Get reserved or allocated container from application
+          RMContainer reservedOrAllocatedRMContainer =
+              application.getRMContainer(assignment.getAssignmentInformation()
+                  .getFirstAllocatedOrReservedContainerId());
+
+          // Book-keeping
+          // Note: Update headroom to account for current allocation too...
+          allocateResource(clusterResource, application, assigned,
+              node.getPartition(), reservedOrAllocatedRMContainer,
+              assignment.isIncreasedAllocation());
+
+          // Update reserved metrics
+          Resource reservedRes = assignment.getAssignmentInformation()
+              .getReserved();
+          if (reservedRes != null && !reservedRes.equals(Resources.none())) {
+            incReservedResource(node.getPartition(), reservedRes);
+          }
+
+          // Done
+          return assignment;
+        } else if (assignment.getSkippedType()
+            == CSAssignment.SkippedType.OTHER) {
+          application.updateNodeInfoForAMDiagnostics(node);
+        } else if(assignment.getSkippedType()
+            == CSAssignment.SkippedType.QUEUE_LIMIT) {
+          return assignment;
+        } else {
+          // If we don't allocate anything, and it is not skipped by application,
+          // we will return to respect FIFO of applications
+          return CSAssignment.NULL_ASSIGNMENT;
+        }
       }
-    }
 
-    return CSAssignment.NULL_ASSIGNMENT;
+      return CSAssignment.NULL_ASSIGNMENT;
+    }
   }
 
   protected Resource getHeadroom(User user, Resource queueCurrentLimit,