浏览代码

YARN-1398. Fixed a deadlock in ResourceManager between users requesting queue-acls and completing containers. Contributed by Vinod Kumar Vavilapalli.
svn merge --ignore-ancestry -c 1570415 ../../trunk/


git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1570416 13f79535-47bb-0310-9956-ffa450edef68

Vinod Kumar Vavilapalli 11 年之前
父节点
当前提交
17d43bd735

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -306,6 +306,9 @@ Release 2.4.0 - UNRELEASED
     YARN-713. Fixed ResourceManager to not crash while building tokens when DNS
     YARN-713. Fixed ResourceManager to not crash while building tokens when DNS
     issues happen intermittently. (Jian He via vinodkv)
     issues happen intermittently. (Jian He via vinodkv)
 
 
+    YARN-1398. Fixed a deadlock in ResourceManager between users requesting
+    queue-acls and completing containers. (vinodkv)
+
 Release 2.3.1 - UNRELEASED
 Release 2.3.1 - UNRELEASED
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 8 - 6
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java

@@ -50,7 +50,6 @@ import org.apache.hadoop.yarn.api.records.QueueState;
 import org.apache.hadoop.yarn.api.records.QueueUserACLInfo;
 import org.apache.hadoop.yarn.api.records.QueueUserACLInfo;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.api.records.ResourceRequest;
 import org.apache.hadoop.yarn.api.records.ResourceRequest;
-import org.apache.hadoop.yarn.api.records.Token;
 import org.apache.hadoop.yarn.factories.RecordFactory;
 import org.apache.hadoop.yarn.factories.RecordFactory;
 import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
 import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
@@ -1410,12 +1409,14 @@ public class LeafQueue implements CSQueue {
       FiCaSchedulerApp application, FiCaSchedulerNode node, RMContainer rmContainer, 
       FiCaSchedulerApp application, FiCaSchedulerNode node, RMContainer rmContainer, 
       ContainerStatus containerStatus, RMContainerEventType event, CSQueue childQueue) {
       ContainerStatus containerStatus, RMContainerEventType event, CSQueue childQueue) {
     if (application != null) {
     if (application != null) {
+
+      boolean removed = false;
+
       // Careful! Locking order is important!
       // Careful! Locking order is important!
       synchronized (this) {
       synchronized (this) {
 
 
         Container container = rmContainer.getContainer();
         Container container = rmContainer.getContainer();
 
 
-        boolean removed = false;
         // Inform the application & the node
         // Inform the application & the node
         // Note: It's safe to assume that all state changes to RMContainer
         // Note: It's safe to assume that all state changes to RMContainer
         // happen under scheduler's lock... 
         // happen under scheduler's lock... 
@@ -1441,13 +1442,14 @@ public class LeafQueue implements CSQueue {
               " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() +
               " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() +
               " used=" + usedResources +
               " used=" + usedResources +
               " cluster=" + clusterResource);
               " cluster=" + clusterResource);
-          // Inform the parent queue
-          getParent().completedContainer(clusterResource, application,
-              node, rmContainer, null, event, this);
         }
         }
       }
       }
 
 
-
+      if (removed) {
+        // Inform the parent queue _outside_ of the leaf-queue lock
+        getParent().completedContainer(clusterResource, application, node,
+          rmContainer, null, event, this);
+      }
     }
     }
   }
   }