Bladeren bron

YARN-325. RM CapacityScheduler can deadlock when getQueueInfo() is called and a container is completing (Arun C Murthy via tgraves)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23@1431071 13f79535-47bb-0310-9956-ffa450edef68
Thomas Graves 12 jaren geleden
bovenliggende
commit
65eb26683f

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -49,6 +49,9 @@ Release 0.23.6 - UNRELEASED
     YARN-320. RM should always be able to renew its own tokens.
     (Daryn Sharp via sseth)
 
+    YARN-325. RM CapacityScheduler can deadlock when getQueueInfo() is 
+    called and a container is completing (Arun C Murthy via tgraves)
+
 Release 0.23.5 - UNRELEASED
 
   INCOMPATIBLE CHANGES

+ 23 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSAssignment.java

@@ -20,19 +20,33 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
 import org.apache.hadoop.classification.InterfaceAudience.Private;
 import org.apache.hadoop.classification.InterfaceStability.Unstable;
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApp;
 
 @Private
 @Unstable
 public class CSAssignment {
   final private Resource resource;
   private NodeType type;
+  private final RMContainer excessReservation;
+  private final SchedulerApp application;
   
   public CSAssignment(Resource resource, NodeType type) {
     this.resource = resource;
     this.type = type;
+    this.application = null;
+    this.excessReservation = null;
+  }
+  
+  public CSAssignment(SchedulerApp application, RMContainer excessReservation) {
+    this.resource = excessReservation.getContainer().getResource();
+    this.type = NodeType.NODE_LOCAL;
+    this.application = application;
+    this.excessReservation = excessReservation;
   }
 
+
   public Resource getResource() {
     return resource;
   }
@@ -45,8 +59,16 @@ public class CSAssignment {
     this.type = type;
   }
   
+  public SchedulerApp getApplication() {
+    return application;
+  }
+
+  public RMContainer getExcessReservation() {
+    return excessReservation;
+  }
+
   @Override
   public String toString() {
     return resource.getMemory() + ":" + type;
   }
-}
+}

+ 14 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java

@@ -583,7 +583,20 @@ implements ResourceScheduler, CapacitySchedulerContext, Configurable {
           reservedApplication.getApplicationId() + " on node: " + nm);
       
       LeafQueue queue = ((LeafQueue)reservedApplication.getQueue());
-      queue.assignContainers(clusterResource, node);
+      CSAssignment assignment = queue.assignContainers(clusterResource, node);
+      
+      RMContainer excessReservation = assignment.getExcessReservation();
+      if (excessReservation != null) {
+      Container container = excessReservation.getContainer();
+      queue.completedContainer(
+          clusterResource, assignment.getApplication(), node, 
+          excessReservation, 
+          SchedulerUtils.createAbnormalContainerStatus(
+              container.getId(), 
+              SchedulerUtils.UNRESERVED_CONTAINER), 
+          RMContainerEventType.RELEASED);
+      }
+
     }
 
     // Try to schedule more if there are no reservations to fulfill

+ 7 - 15
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java

@@ -760,11 +760,9 @@ public class LeafQueue implements CSQueue {
     if (reservedContainer != null) {
       SchedulerApp application = 
           getApplication(reservedContainer.getApplicationAttemptId());
-      return new CSAssignment(
+      return 
           assignReservedContainer(application, node, reservedContainer, 
-              clusterResource),
-          NodeType.NODE_LOCAL); // Don't care about locality constraints 
-                                // for reserved containers
+              clusterResource); 
     }
     
     // Try to assign containers to applications in order
@@ -850,20 +848,14 @@ public class LeafQueue implements CSQueue {
 
   }
 
-  private synchronized Resource assignReservedContainer(SchedulerApp application, 
-      SchedulerNode node, RMContainer rmContainer, Resource clusterResource) {
+  private synchronized CSAssignment assignReservedContainer(
+      SchedulerApp application, SchedulerNode node, RMContainer rmContainer, 
+      Resource clusterResource) {
     // Do we still need this reservation?
     Priority priority = rmContainer.getReservedPriority();
     if (application.getTotalRequiredResources(priority) == 0) {
       // Release
-      Container container = rmContainer.getContainer();
-      completedContainer(clusterResource, application, node, 
-          rmContainer, 
-          SchedulerUtils.createAbnormalContainerStatus(
-              container.getId(), 
-              SchedulerUtils.UNRESERVED_CONTAINER), 
-          RMContainerEventType.RELEASED);
-      return container.getResource(); // Ugh, return resource to force re-sort
+      return new CSAssignment(application, rmContainer);
     }
 
     // Try to assign if we have sufficient resources
@@ -872,7 +864,7 @@ public class LeafQueue implements CSQueue {
     
     // Doesn't matter... since it's already charged for at time of reservation
     // "re-reservation" is *free*
-    return org.apache.hadoop.yarn.server.resourcemanager.resource.Resource.NONE;
+    return new CSAssignment(Resources.none(), NodeType.NODE_LOCAL);
   }
 
   private synchronized boolean assignToQueue(Resource clusterResource, 

+ 5 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java

@@ -1164,12 +1164,14 @@ public class TestLeafQueue {
     // Now finish another container from app_0 and see the reservation cancelled
     a.completedContainer(clusterResource, app_0, node_0, 
         app_0.getLiveContainers().iterator().next(), null, RMContainerEventType.KILL);
-    a.assignContainers(clusterResource, node_0);
-    assertEquals(4*GB, a.getUsedResources().getMemory());
+    CSAssignment assignment = a.assignContainers(clusterResource, node_0);
+    assertEquals(8*GB, a.getUsedResources().getMemory());
     assertEquals(0*GB, app_0.getCurrentConsumption().getMemory());
     assertEquals(4*GB, app_1.getCurrentConsumption().getMemory());
-    assertEquals(0*GB, app_1.getCurrentReservation().getMemory());
+    assertEquals(4*GB, app_1.getCurrentReservation().getMemory());
     assertEquals(0*GB, node_0.getUsedResource().getMemory());
+    assertEquals(4*GB, 
+        assignment.getExcessReservation().getContainer().getResource().getMemory());
   }