
YARN-5864. Capacity Scheduler - Queue Priorities. (wangda)

Wangda Tan 8 years ago
parent
commit
ce832059db
37 changed files with 2764 additions and 231 deletions
  1. 0 4
      hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml
  2. 7 7
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/AbstractPreemptableResourceCalculator.java
  3. 2 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptionCandidatesSelector.java
  4. 37 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicy.java
  5. 510 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/QueuePriorityContainerCandidateSelector.java
  6. 30 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempQueuePerPartition.java
  7. 120 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempSchedulerNode.java
  8. 6 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java
  9. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java
  10. 10 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java
  11. 7 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java
  12. 65 10
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
  13. 212 24
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
  14. 0 4
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerContext.java
  15. 0 3
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerQueueManager.java
  16. 2 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
  17. 39 31
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java
  18. 0 72
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/PartitionedQueueComparator.java
  19. 186 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/policy/PriorityUtilizationQueueOrderingPolicy.java
  20. 52 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/policy/QueueOrderingPolicy.java
  21. 89 7
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
  22. 64 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicyMockFramework.java
  23. 361 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestPreemptionForQueueWithPriorities.java
  24. 21 10
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicy.java
  25. 5 6
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyForNodePartitions.java
  26. 10 3
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyMockFramework.java
  27. 19 3
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerPreemptionTestBase.java
  28. 0 9
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java
  29. 0 3
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimitsByPartition.java
  30. 570 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerSurgicalPreemption.java
  31. 0 3
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestChildQueueOrder.java
  32. 111 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestContainerAllocation.java
  33. 5 8
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java
  34. 0 3
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestParentQueue.java
  35. 0 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestReservations.java
  36. 222 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/policy/TestPriorityUtilizationQueueOrderingPolicy.java
  37. 1 10
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/policy/TestFairOrderingPolicy.java

+ 0 - 4
hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml

@@ -183,10 +183,6 @@
     <Class name="org.apache.hadoop.yarn.server.resourcemanager.scheduler.policy.RecoveryComparator" />
     <Bug pattern="SE_COMPARATOR_SHOULD_BE_SERIALIZABLE" />
   </Match>
-  <Match>
-    <Class name="org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.PartitionedQueueComparator" />
-    <Bug pattern="SE_COMPARATOR_SHOULD_BE_SERIALIZABLE" />
-  </Match>
   <!-- Ignore some irrelevant class name warning -->
   <Match>
     <Class name="org.apache.hadoop.yarn.api.records.SerializedException" />

+ 7 - 7
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/AbstractPreemptableResourceCalculator.java

@@ -19,6 +19,7 @@
 package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
 
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.policy.PriorityUtilizationQueueOrderingPolicy;
 import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
 import org.apache.hadoop.yarn.util.resource.Resources;
 
@@ -49,13 +50,11 @@ public class AbstractPreemptableResourceCalculator {
 
     @Override
     public int compare(TempQueuePerPartition tq1, TempQueuePerPartition tq2) {
-      if (getIdealPctOfGuaranteed(tq1) < getIdealPctOfGuaranteed(tq2)) {
-        return -1;
-      }
-      if (getIdealPctOfGuaranteed(tq1) > getIdealPctOfGuaranteed(tq2)) {
-        return 1;
-      }
-      return 0;
+      double assigned1 = getIdealPctOfGuaranteed(tq1);
+      double assigned2 = getIdealPctOfGuaranteed(tq2);
+
+      return PriorityUtilizationQueueOrderingPolicy.compare(assigned1,
+          assigned2, tq1.relativePriority, tq2.relativePriority);
     }
 
     // Calculates idealAssigned / guaranteed
@@ -156,6 +155,7 @@ public class AbstractPreemptableResourceCalculator {
       // way, the most underserved queue(s) are always given resources first.
       Collection<TempQueuePerPartition> underserved = getMostUnderservedQueues(
           orderedByNeed, tqComparator);
+
       for (Iterator<TempQueuePerPartition> i = underserved.iterator(); i
           .hasNext();) {
         TempQueuePerPartition sub = i.next();
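
The comparator above now delegates to PriorityUtilizationQueueOrderingPolicy.compare(assigned1, assigned2, priority1, priority2), whose body is not shown in this diff. A plausible sketch of its semantics, assuming priority only decides between queues on the same side of their guarantee (and lower relative assignment wins otherwise), could look like:

  // Sketch only, not the committed implementation.
  public static int compare(double relativeAssigned1, double relativeAssigned2,
      int priority1, int priority2) {
    if (priority1 == priority2) {
      // Same priority: the queue with the smaller relative assignment first.
      return Double.compare(relativeAssigned1, relativeAssigned2);
    }
    if ((relativeAssigned1 < 1.0 && relativeAssigned2 < 1.0)
        || (relativeAssigned1 >= 1.0 && relativeAssigned2 >= 1.0)) {
      // Both below, or both at/above, their guarantee: higher priority first.
      return Integer.compare(priority2, priority1);
    }
    // Exactly one queue is below its guarantee: it goes first, regardless
    // of priority.
    return Double.compare(relativeAssigned1, relativeAssigned2);
  }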

+ 2 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptionCandidatesSelector.java

@@ -48,7 +48,8 @@ public abstract class PreemptionCandidatesSelector {
    * @param selectedCandidates already selected candidates from previous policies
    * @param clusterResource total resource
    * @param totalPreemptedResourceAllowed how many resources allowed to be
-   *                                      preempted in this round
+   *                                      preempted in this round. Should be
+   *                                      updated (set in-place) by the call
    * @return merged selected candidates.
    */
   public abstract Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(

+ 37 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicy.java

@@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableSet;
+import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -193,6 +194,14 @@ public class ProportionalCapacityPreemptionPolicy
     rc = scheduler.getResourceCalculator();
     nlm = scheduler.getRMContext().getNodeLabelManager();
 
+    // Do we need the queue-priority preemption policy?
+    boolean isQueuePriorityPreemptionEnabled =
+        csConfig.getPUOrderingPolicyUnderUtilizedPreemptionEnabled();
+    if (isQueuePriorityPreemptionEnabled) {
+      candidatesSelectionPolicies.add(
+          new QueuePriorityContainerCandidateSelector(this));
+    }
+
     // Do we need to specially consider reserved containers?
     boolean selectCandidatesForResevedContainers = csConfig.getBoolean(
         CapacitySchedulerConfiguration.
@@ -352,6 +361,8 @@ public class ProportionalCapacityPreemptionPolicy
                 .clone(nlm.getResourceByLabel(partitionToLookAt, clusterResources)),
             partitionToLookAt);
       }
+
+      // Update effective priority of queues
     }
 
     this.leafQueueNames = ImmutableSet.copyOf(getLeafQueueNames(
@@ -368,13 +379,28 @@ public class ProportionalCapacityPreemptionPolicy
         new HashMap<>();
     for (PreemptionCandidatesSelector selector :
         candidatesSelectionPolicies) {
+      long startTime = 0;
       if (LOG.isDebugEnabled()) {
         LOG.debug(MessageFormat
             .format("Trying to use {0} to select preemption candidates",
                 selector.getClass().getName()));
+        startTime = clock.getTime();
       }
       toPreempt = selector.selectCandidates(toPreempt,
           clusterResources, totalPreemptionAllowed);
+
+      if (LOG.isDebugEnabled()) {
+        LOG.debug(MessageFormat
+            .format("{0} uses {1} millisecond to run",
+                selector.getClass().getName(), clock.getTime() - startTime));
+        int totalSelected = 0;
+        for (Set<RMContainer> set : toPreempt.values()) {
+          totalSelected += set.size();
+        }
+        LOG.debug(MessageFormat
+            .format("So far, total {0} containers selected to be preempted",
+                totalSelected));
+      }
     }
 
     if (LOG.isDebugEnabled()) {
@@ -470,11 +496,22 @@ public class ProportionalCapacityPreemptionPolicy
           reserved, curQueue);
 
       if (curQueue instanceof ParentQueue) {
+        String configuredOrderingPolicy =
+            ((ParentQueue) curQueue).getQueueOrderingPolicy().getConfigName();
+
         // Recursively add children
         for (CSQueue c : curQueue.getChildQueues()) {
           TempQueuePerPartition subq = cloneQueues(c, partitionResource,
               partitionToLookAt);
+
+          // If we respect priority
+          if (StringUtils.equals(
+              CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY,
+              configuredOrderingPolicy)) {
+            subq.relativePriority = c.getPriority().getPriority();
+          }
           ret.addChild(subq);
+          subq.parent = ret;
         }
       }
     } finally {
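
The new QueuePriorityContainerCandidateSelector is only wired in when the priority-utilization ordering policy's under-utilized preemption flag is enabled. A minimal configuration sketch using the setters added to CapacitySchedulerConfiguration later in this commit (the queue names root.a and root.b are hypothetical):

  CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();

  // Order root's children by priority + utilization instead of utilization only.
  conf.setQueueOrderingPolicy("root",
      CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY);

  // Hypothetical child queues: give root.a higher priority than root.b.
  conf.setQueuePriority("root.a", 2);
  conf.setQueuePriority("root.b", 1);

  // Allow an under-utilized, higher-priority queue to preempt even from a
  // not-yet-satisfied lower-priority queue, after a 60s reservation delay.
  conf.setPUOrderingPolicyUnderUtilizedPreemptionEnabled(true);
  conf.setPUOrderingPolicyUnderUtilizedPreemptionDelay(60 * 1000L);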

+ 510 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/QueuePriorityContainerCandidateSelector.java

@@ -0,0 +1,510 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
+
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.Table;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.NodeId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceRequest;
+import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
+import org.apache.hadoop.yarn.util.resource.Resources;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+public class QueuePriorityContainerCandidateSelector
+    extends PreemptionCandidatesSelector {
+  private static final Log LOG =
+      LogFactory.getLog(QueuePriorityContainerCandidateSelector.class);
+
+  // Configured timeout before doing reserved container preemption
+  private long minTimeout;
+
+  // Allow move reservation around for better placement?
+  private boolean allowMoveReservation;
+
+  // All the reserved containers in the system which could possibly preempt
+  // from queues with lower priorities
+  private List<RMContainer> reservedContainers;
+
+  // From -> To
+  // A digraph to represent if one queue has higher priority than another.
+  // For example, a->b means queue=a has higher priority than queue=b
+  private Table<String, String, Boolean> priorityDigraph =
+      HashBasedTable.create();
+
+  private Resource clusterResource;
+  private Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates;
+  private Resource totalPreemptionAllowed;
+
+  // A cached scheduler node map, will be refreshed each round.
+  private Map<NodeId, TempSchedulerNode> tempSchedulerNodeMap = new HashMap<>();
+
+  // Have we touched (made any changes to) the node in this round?
+  // Once a node is touched, we will not try to move reservations to the node
+  private Set<NodeId> touchedNodes;
+
+  // Resources marked to be preempted from other queues.
+  // <Queue, Partition, Resource-marked-to-be-preempted-from-other-queue>
+  private Table<String, String, Resource> toPreemptedFromOtherQueues =
+      HashBasedTable.create();
+
+  private final Comparator<RMContainer>
+      CONTAINER_CREATION_TIME_COMPARATOR = new Comparator<RMContainer>() {
+    @Override
+    public int compare(RMContainer o1, RMContainer o2) {
+      if (preemptionAllowed(o1.getQueueName(), o2.getQueueName())) {
+        return -1;
+      } else if (preemptionAllowed(o2.getQueueName(), o1.getQueueName())) {
+        return 1;
+      }
+
+      // If two queues cannot preempt each other, compare creation time.
+      return Long.compare(o1.getCreationTime(), o2.getCreationTime());
+    }
+  };
+
+  QueuePriorityContainerCandidateSelector(
+      CapacitySchedulerPreemptionContext preemptionContext) {
+    super(preemptionContext);
+
+    // Initialize parameters
+    CapacitySchedulerConfiguration csc =
+        preemptionContext.getScheduler().getConfiguration();
+
+    minTimeout = csc.getPUOrderingPolicyUnderUtilizedPreemptionDelay();
+    allowMoveReservation =
+        csc.getPUOrderingPolicyUnderUtilizedPreemptionMoveReservation();
+  }
+
+  private List<TempQueuePerPartition> getPathToRoot(TempQueuePerPartition tq) {
+    List<TempQueuePerPartition> list = new ArrayList<>();
+    while (tq != null) {
+      list.add(tq);
+      tq = tq.parent;
+    }
+    return list;
+  }
+
+  private void intializePriorityDigraph() {
+    LOG.info("Initializing priority preemption directed graph:");
+
+    // Make sure we iterate all leaf queue combinations
+    for (String q1 : preemptionContext.getLeafQueueNames()) {
+      for (String q2 : preemptionContext.getLeafQueueNames()) {
+        // Make sure we only calculate each combination once instead of all
+        // permutations
+        if (q1.compareTo(q2) < 0) {
+          TempQueuePerPartition tq1 = preemptionContext.getQueueByPartition(q1,
+              RMNodeLabelsManager.NO_LABEL);
+          TempQueuePerPartition tq2 = preemptionContext.getQueueByPartition(q2,
+              RMNodeLabelsManager.NO_LABEL);
+
+          List<TempQueuePerPartition> path1 = getPathToRoot(tq1);
+          List<TempQueuePerPartition> path2 = getPathToRoot(tq2);
+
+          // Get direct ancestor below LCA (Lowest common ancestor)
+          int i = path1.size() - 1;
+          int j = path2.size() - 1;
+          while (path1.get(i).queueName.equals(path2.get(j).queueName)) {
+            i--;
+            j--;
+          }
+
+          // compare priority of path1[i] and path2[j]
+          int p1 = path1.get(i).relativePriority;
+          int p2 = path2.get(j).relativePriority;
+          if (p1 < p2) {
+            priorityDigraph.put(q2, q1, true);
+            if (LOG.isDebugEnabled()) {
+              LOG.info("- Added priority ordering edge: " + q2 + " >> " + q1);
+            }
+          } else if (p2 < p1) {
+            priorityDigraph.put(q1, q2, true);
+            if (LOG.isDebugEnabled()) {
+              LOG.info("- Added priority ordering edge: " + q1 + " >> " + q2);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Do we allow demandingQueue to preempt resources from toBePreemptedQueue?
+   *
+   * @param demandingQueue demandingQueue
+   * @param toBePreemptedQueue toBePreemptedQueue
+   * @return can/cannot
+   */
+  private boolean preemptionAllowed(String demandingQueue,
+      String toBePreemptedQueue) {
+    return priorityDigraph.contains(demandingQueue,
+        toBePreemptedQueue);
+  }
+
+  /**
+   * Can we preempt enough resources for the given ask:
+   *
+   * @param requiredResource askedResource
+   * @param demandingQueue demandingQueue
+   * @param schedulerNode node
+   * @param lookingForNewReservationPlacement Are we trying to move a
+   *        reservation to this node
+   * @param newlySelectedContainers newly selected containers, will be set when
+   *        we can preempt enough resources from the node.
+   *
+   * @return can/cannot
+   */
+  private boolean canPreemptEnoughResourceForAsked(Resource requiredResource,
+      String demandingQueue, FiCaSchedulerNode schedulerNode,
+      boolean lookingForNewReservationPlacement,
+      List<RMContainer> newlySelectedContainers) {
+    // Do not check touched nodes again.
+    if (touchedNodes.contains(schedulerNode.getNodeID())) {
+      return false;
+    }
+
+    TempSchedulerNode node = tempSchedulerNodeMap.get(schedulerNode.getNodeID());
+    if (null == node) {
+      node = TempSchedulerNode.fromSchedulerNode(schedulerNode);
+      tempSchedulerNodeMap.put(schedulerNode.getNodeID(), node);
+    }
+
+    if (null != schedulerNode.getReservedContainer()
+        && lookingForNewReservationPlacement) {
+      // Node reserved by others, skip this node. We will not try to move
+      // the reservation to a node which is already reserved.
+      return false;
+    }
+
+    // Needed preemption = asked - (node.total - node.allocated)
+    Resource lacking = Resources.subtract(requiredResource, Resources
+        .subtract(node.getTotalResource(), node.getAllocatedResource()));
+
+    // On each host, simply check if we could preempt containers from
+    // lower-prioritized queues or not
+    List<RMContainer> runningContainers = node.getRunningContainers();
+    Collections.sort(runningContainers, CONTAINER_CREATION_TIME_COMPARATOR);
+
+    // First of all, consider already selected containers
+    for (RMContainer runningContainer : runningContainers) {
+      if (CapacitySchedulerPreemptionUtils.isContainerAlreadySelected(
+          runningContainer, selectedCandidates)) {
+        Resources.subtractFrom(lacking,
+            runningContainer.getAllocatedResource());
+      }
+    }
+
+    // If we can already allocate the reserved container after preemption,
+    // skip the following steps
+    if (Resources.fitsIn(rc, clusterResource, lacking,
+        Resources.none())) {
+      return true;
+    }
+
+    Resource allowed = Resources.clone(totalPreemptionAllowed);
+    Resource selected = Resources.createResource(0);
+
+    for (RMContainer runningContainer : runningContainers) {
+      if (CapacitySchedulerPreemptionUtils.isContainerAlreadySelected(
+          runningContainer, selectedCandidates)) {
+        // ignore selected containers
+        continue;
+      }
+
+      // Only preempt resource from queue with lower priority
+      if (!preemptionAllowed(demandingQueue,
+          runningContainer.getQueueName())) {
+        continue;
+      }
+
+      // Don't preempt AM container
+      if (runningContainer.isAMContainer()) {
+        continue;
+      }
+
+      // Don't allow preempting more than the limit
+      if (Resources.greaterThanOrEqual(rc, clusterResource, allowed,
+          runningContainer.getAllocatedResource())) {
+        Resources.subtractFrom(allowed,
+            runningContainer.getAllocatedResource());
+        Resources.subtractFrom(lacking,
+            runningContainer.getAllocatedResource());
+        Resources.addTo(selected, runningContainer.getAllocatedResource());
+
+        if (null != newlySelectedContainers) {
+          newlySelectedContainers.add(runningContainer);
+        }
+      }
+
+      // Lacking <= 0 means we can allocate the reserved container
+      if (Resources.fitsIn(rc, clusterResource, lacking, Resources.none())) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  private boolean preChecksForMovingReservedContainerToNode(
+      RMContainer reservedContainer, FiCaSchedulerNode newNode) {
+    // Don't do this if it has hard-locality preferences
+    if (reservedContainer.getReservedSchedulerKey().getContainerToUpdate()
+        != null) {
+      // This means a container update request (like increase / promote)
+      return false;
+    }
+
+    // For normal requests
+    FiCaSchedulerApp app =
+        preemptionContext.getScheduler().getApplicationAttempt(
+            reservedContainer.getApplicationAttemptId());
+    if (!app.getAppSchedulingInfo().canDelayTo(
+        reservedContainer.getAllocatedSchedulerKey(), ResourceRequest.ANY)) {
+      // This is a hard locality request
+      return false;
+    }
+
+    // Check if newNode's partition matches requested partition
+    if (!StringUtils.equals(reservedContainer.getNodeLabelExpression(),
+        newNode.getPartition())) {
+      return false;
+    }
+
+    return true;
+  }
+
+  private void tryToMakeBetterReservationPlacement(
+      RMContainer reservedContainer,
+      List<FiCaSchedulerNode> allSchedulerNodes) {
+    for (FiCaSchedulerNode targetNode : allSchedulerNodes) {
+      // Precheck if we can move the rmContainer to the new targetNode
+      if (!preChecksForMovingReservedContainerToNode(reservedContainer,
+          targetNode)) {
+        continue;
+      }
+
+      if (canPreemptEnoughResourceForAsked(
+          reservedContainer.getReservedResource(),
+          reservedContainer.getQueueName(), targetNode, true, null)) {
+        NodeId fromNode = reservedContainer.getNodeId();
+
+        // We can place container to this targetNode, so just go ahead and notify
+        // scheduler
+        if (preemptionContext.getScheduler().moveReservedContainer(
+            reservedContainer, targetNode)) {
+          LOG.info("Successfully moved reserved container=" + reservedContainer
+              .getContainerId() + " from targetNode=" + fromNode
+              + " to targetNode=" + targetNode.getNodeID());
+          touchedNodes.add(targetNode.getNodeID());
+        }
+      }
+    }
+  }
+
+  /**
+   * Do we allow the demanding queue to preempt resources from other queues?
+   * A satisfied queue is not allowed to preempt resources from other queues.
+   * @param demandingQueue
+   * @return allowed/not
+   */
+  private boolean isQueueSatisfied(String demandingQueue,
+      String partition) {
+    TempQueuePerPartition tq = preemptionContext.getQueueByPartition(
+        demandingQueue, partition);
+    if (null == tq) {
+      return false;
+    }
+
+    Resource guaranteed = tq.getGuaranteed();
+    Resource usedDeductReservd = Resources.subtract(tq.getUsed(),
+        tq.getReserved());
+    Resource markedToPreemptFromOtherQueue = toPreemptedFromOtherQueues.get(
+        demandingQueue, partition);
+    if (null == markedToPreemptFromOtherQueue) {
+      markedToPreemptFromOtherQueue = Resources.none();
+    }
+
+    // return Used - reserved + to-preempt-from-other-queue >= guaranteed
+    boolean flag = Resources.greaterThanOrEqual(rc, clusterResource,
+        Resources.add(usedDeductReservd, markedToPreemptFromOtherQueue),
+        guaranteed);
+    return flag;
+  }
+
+  private void incToPreempt(String queue, String partition,
+      Resource allocated) {
+    Resource total = toPreemptedFromOtherQueues.get(queue, partition);
+    if (null == total) {
+      total = Resources.createResource(0);
+      toPreemptedFromOtherQueues.put(queue, partition, total);
+    }
+
+    Resources.addTo(total, allocated);
+  }
+
+  @Override
+  public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
+      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
+      Resource clusterResource,
+      Resource totalPreemptedResourceAllowed) {
+    // Initialize digraph from queues
+    // TODO (wangda): only do this when queue refreshed.
+    priorityDigraph.clear();
+    intializePriorityDigraph();
+
+    // When all queues are set to same priority, or priority is not respected,
+    // direct return.
+    if (priorityDigraph.isEmpty()) {
+      return selectedCandidates;
+    }
+
+    // Save parameters to be shared by other methods
+    this.selectedCandidates = selectedCandidates;
+    this.clusterResource = clusterResource;
+    this.totalPreemptionAllowed = totalPreemptedResourceAllowed;
+
+    toPreemptedFromOtherQueues.clear();
+
+    reservedContainers = new ArrayList<>();
+
+    // Clear temp-scheduler-node-map every time when doing selection of
+    // containers.
+    tempSchedulerNodeMap.clear();
+    touchedNodes = new HashSet<>();
+
+    // Add all reserved containers for analysis
+    List<FiCaSchedulerNode> allSchedulerNodes =
+        preemptionContext.getScheduler().getAllNodes();
+    for (FiCaSchedulerNode node : allSchedulerNodes) {
+      RMContainer reservedContainer = node.getReservedContainer();
+      if (null != reservedContainer) {
+        // Add to reservedContainers list if the queue that the reserved
+        // container belongs to has higher priority than at least one queue
+        if (priorityDigraph.containsRow(
+            reservedContainer.getQueueName())) {
+          reservedContainers.add(reservedContainer);
+        }
+      }
+    }
+
+    // Sort reserved containers by creation time
+    Collections.sort(reservedContainers, CONTAINER_CREATION_TIME_COMPARATOR);
+
+    long currentTime = System.currentTimeMillis();
+
+    // From the beginning of the list
+    for (RMContainer reservedContainer : reservedContainers) {
+      // Only try to preempt for a reserved container if it was created
+      // at least minTimeout ago and still cannot be allocated
+      if (currentTime - reservedContainer.getCreationTime() < minTimeout) {
+        continue;
+      }
+
+      FiCaSchedulerNode node = preemptionContext.getScheduler().getNode(
+          reservedContainer.getReservedNode());
+      if (null == node) {
+        // Something is wrong, ignore
+        continue;
+      }
+
+      List<RMContainer> newlySelectedToBePreemptContainers = new ArrayList<>();
+
+      // Check if we can preempt for this queue
+      // We will skip if the demanding queue is already satisfied.
+      String demandingQueueName = reservedContainer.getQueueName();
+      boolean demandingQueueSatisfied = isQueueSatisfied(demandingQueueName,
+          node.getPartition());
+
+      // We will continue to check if it is possible to preempt for the
+      // reserved container on the node.
+      boolean canPreempt = false;
+      if (!demandingQueueSatisfied) {
+        canPreempt = canPreemptEnoughResourceForAsked(
+            reservedContainer.getReservedResource(), demandingQueueName, node,
+            false, newlySelectedToBePreemptContainers);
+      }
+
+      // Add selected containers if we can allocate the reserved container by
+      // preempting others
+      if (canPreempt) {
+        touchedNodes.add(node.getNodeID());
+
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Trying to preempt following containers to make reserved "
+              + "container=" + reservedContainer.getContainerId() + " on node="
+              + node.getNodeID() + " can be allocated:");
+        }
+
+        // Update to-be-preempt
+        incToPreempt(demandingQueueName, node.getPartition(),
+            reservedContainer.getReservedResource());
+
+        for (RMContainer c : newlySelectedToBePreemptContainers) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug(" --container=" + c.getContainerId() + " resource=" + c
+                .getReservedResource());
+          }
+
+          Set<RMContainer> containers = selectedCandidates.get(
+              c.getApplicationAttemptId());
+          if (null == containers) {
+            containers = new HashSet<>();
+            selectedCandidates.put(c.getApplicationAttemptId(), containers);
+          }
+          containers.add(c);
+
+          // Update totalPreemptedResourceAllowed
+          Resources.subtractFrom(totalPreemptedResourceAllowed,
+              c.getAllocatedResource());
+        }
+      } else if (!demandingQueueSatisfied) {
+        // We failed to preempt enough resources to allocate the container.
+        // This typically happens when the reserved node is not a good fit;
+        // try to see if we can reserve the container on a better host.
+        // Only do this if the demanding queue is not satisfied.
+        //
+        // TODO (wangda): do more tests before making it usable
+        //
+        if (allowMoveReservation) {
+          tryToMakeBetterReservationPlacement(reservedContainer,
+              allSchedulerNodes);
+        }
+      }
+    }
+
+    return selectedCandidates;
+  }
+}
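
intializePriorityDigraph compares every pair of leaf queues by walking both paths to the root and comparing the direct children of their lowest common ancestor. A small self-contained sketch of that walk, on a hypothetical tree root -> {p1 -> a, p2 -> c} (names illustrative, not from this commit):

  import java.util.Arrays;
  import java.util.List;

  public class LcaWalkDemo {
    public static void main(String[] args) {
      // Leaf-to-root paths, as built by getPathToRoot().
      List<String> path1 = Arrays.asList("a", "p1", "root");
      List<String> path2 = Arrays.asList("c", "p2", "root");

      int i = path1.size() - 1, j = path2.size() - 1;
      // Walk down from the root until the paths diverge.
      while (path1.get(i).equals(path2.get(j))) {
        i--;
        j--;
      }
      // p1 and p2 are the direct children of the LCA (root); their relative
      // priorities decide whether edge a >> c or c >> a enters the digraph.
      System.out.println(path1.get(i) + " vs " + path2.get(j)); // p1 vs p2
    }
  }

If p1 has the higher priority, the edge a >> c is recorded and queue a is allowed to preempt from queue c; two leaves under the same parent are compared directly by their own relative priorities.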

+ 30 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempQueuePerPartition.java

@@ -53,6 +53,12 @@ public class TempQueuePerPartition extends AbstractPreemptionEntity {
 
   protected Resource pendingDeductReserved;
 
+  // Relative priority of this queue to its parent
+  // If parent queue's ordering policy doesn't respect priority,
+  // this will always be 0
+  int relativePriority = 0;
+  TempQueuePerPartition parent = null;
+
   TempQueuePerPartition(String queueName, Resource current,
       boolean preemptionDisabled, String partition, Resource killable,
       float absCapacity, float absMaxCapacity, Resource totalPartitionResource,
@@ -114,8 +120,15 @@ public class TempQueuePerPartition extends AbstractPreemptionEntity {
     Resource absMaxCapIdealAssignedDelta = Resources.componentwiseMax(
         Resources.subtract(getMax(), idealAssigned),
         Resource.newInstance(0, 0));
-    // remain = avail - min(avail, (max - assigned), (current + pending -
-    // assigned))
+    // accepted = min{avail,
+    //               max - assigned,
+    //               current + pending - assigned,
+    //               # Make sure a queue will not get more than max of its
+    //               # used/guaranteed, this is to make sure preemption won't
+    //               # happen if all active queues are beyond their guaranteed
+    //               # This is for leaf queue only.
+    //               max(guaranteed, used) - assigned}
+    // remain = avail - accepted
     Resource accepted = Resources.min(rc, clusterResource,
         absMaxCapIdealAssignedDelta,
         Resources.min(rc, clusterResource, avail, Resources
@@ -137,6 +150,21 @@ public class TempQueuePerPartition extends AbstractPreemptionEntity {
             .subtract(Resources.add(getUsed(),
                 (considersReservedResource ? pending : pendingDeductReserved)),
                 idealAssigned)));
+
+    // For leaf queue: accept = min(accept, max(guaranteed, used) - assigned)
+    // Why only for leaf queue?
+    // Because a satisfied parent queue could still have some under-utilized
+    // leaf queues. Such an under-utilized leaf queue could preempt resources
+    // from over-utilized leaf queues located in other hierarchies.
+    if (null == children || children.isEmpty()) {
+      Resource maxOfGuranteedAndUsedDeductAssigned = Resources.subtract(
+          Resources.max(rc, clusterResource, getUsed(), getGuaranteed()),
+          idealAssigned);
+      maxOfGuranteedAndUsedDeductAssigned = Resources.max(rc, clusterResource,
+          maxOfGuranteedAndUsedDeductAssigned, Resources.none());
+      accepted = Resources.min(rc, clusterResource, accepted,
+          maxOfGuranteedAndUsedDeductAssigned);
+    }
     Resource remain = Resources.subtract(avail, accepted);
     Resources.addTo(idealAssigned, accepted);
     return remain;
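
A worked example of the computation above, in one resource dimension (numbers illustrative): suppose a leaf queue has guaranteed=40, max=100, used=50, pending=30, idealAssigned=50 so far, and avail=20. Then max - assigned = 50 and current + pending - assigned = 30, while the new leaf-only bound is max(guaranteed, used) - assigned = max(40, 50) - 50 = 0. So accepted = min(20, 50, 30, 0) = 0 and remain = 20: a leaf already at or beyond max(guaranteed, used) accepts nothing more, which is exactly what prevents preemption when all active queues are beyond their guarantees.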

+ 120 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempSchedulerNode.java

@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
+
+import org.apache.hadoop.yarn.api.records.NodeId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
+import org.apache.hadoop.yarn.util.resource.Resources;
+
+import java.util.List;
+
+/**
+ * This class saves the necessary information copied from FiCaSchedulerNode.
+ * It is added mainly for performance: the copy can be cached to avoid
+ * hitting the scheduler again and again. In addition, we can add
+ * preemption-specific fields to the class.
+ */
+public class TempSchedulerNode {
+  private List<RMContainer> runningContainers;
+  private RMContainer reservedContainer;
+  private Resource totalResource;
+
+  // excludes reserved resource
+  private Resource allocatedResource;
+
+  // total - allocated
+  private Resource availableResource;
+
+  // just a shortcut for reservedContainer.getReservedResource().
+  private Resource reservedResource;
+
+  private NodeId nodeId;
+
+  public static TempSchedulerNode fromSchedulerNode(
+      FiCaSchedulerNode schedulerNode) {
+    TempSchedulerNode n = new TempSchedulerNode();
+    n.totalResource = Resources.clone(schedulerNode.getTotalResource());
+    n.allocatedResource = Resources.clone(schedulerNode.getAllocatedResource());
+    n.runningContainers = schedulerNode.getCopiedListOfRunningContainers();
+    n.reservedContainer = schedulerNode.getReservedContainer();
+    if (n.reservedContainer != null) {
+      n.reservedResource = n.reservedContainer.getReservedResource();
+    } else {
+      n.reservedResource = Resources.none();
+    }
+    n.availableResource = Resources.subtract(n.totalResource,
+        n.allocatedResource);
+    n.nodeId = schedulerNode.getNodeID();
+    return n;
+  }
+
+  public NodeId getNodeId() {
+    return nodeId;
+  }
+
+  public List<RMContainer> getRunningContainers() {
+    return runningContainers;
+  }
+
+  public void setRunningContainers(List<RMContainer> runningContainers) {
+    this.runningContainers = runningContainers;
+  }
+
+  public RMContainer getReservedContainer() {
+    return reservedContainer;
+  }
+
+  public void setReservedContainer(RMContainer reservedContainer) {
+    this.reservedContainer = reservedContainer;
+  }
+
+  public Resource getTotalResource() {
+    return totalResource;
+  }
+
+  public void setTotalResource(Resource totalResource) {
+    this.totalResource = totalResource;
+  }
+
+  public Resource getAllocatedResource() {
+    return allocatedResource;
+  }
+
+  public void setAllocatedResource(Resource allocatedResource) {
+    this.allocatedResource = allocatedResource;
+  }
+
+  public Resource getAvailableResource() {
+    return availableResource;
+  }
+
+  public void setAvailableResource(Resource availableResource) {
+    this.availableResource = availableResource;
+  }
+
+  public Resource getReservedResource() {
+    return reservedResource;
+  }
+
+  public void setReservedResource(Resource reservedResource) {
+    this.reservedResource = reservedResource;
+  }
+}
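
A short usage sketch (hypothetical variable names): the selector snapshots each node at most once per round, then does its preemption bookkeeping against the copy rather than the live scheduler node:

  // Snapshot once; subsequent reads and simulated changes stay off the live node.
  TempSchedulerNode temp = TempSchedulerNode.fromSchedulerNode(schedulerNode);

  // E.g., how much is still lacking for a hypothetical ask on this node:
  Resource required = Resource.newInstance(4096, 2);
  Resource lacking = Resources.subtract(required,
      Resources.subtract(temp.getTotalResource(), temp.getAllocatedResource()));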

+ 6 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java

@@ -556,7 +556,12 @@ public class RMContainerImpl implements RMContainer, Comparable<RMContainer> {
       container.reservedResource = e.getReservedResource();
       container.reservedNode = e.getReservedNode();
       container.reservedSchedulerKey = e.getReservedSchedulerKey();
-      
+
+      Container c = container.getContainer();
+      if (c != null) {
+        c.setNodeId(container.reservedNode);
+      }
+
       if (!EnumSet.of(RMContainerState.NEW, RMContainerState.RESERVED)
           .contains(container.getState())) {
         // When container's state != NEW/RESERVED, it is an increase reservation

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java

@@ -407,7 +407,7 @@ public abstract class SchedulerNode {
    * Set the reserved container in the node.
    * @param reservedContainer Reserved container in the node.
    */
-  protected synchronized void
+  public synchronized void
   setReservedContainer(RMContainer reservedContainer) {
     this.reservedContainer = reservedContainer;
   }

+ 10 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java

@@ -109,6 +109,8 @@ public abstract class AbstractCSQueue implements CSQueue {
   protected ReentrantReadWriteLock.ReadLock readLock;
   protected ReentrantReadWriteLock.WriteLock writeLock;
 
+  volatile Priority priority = Priority.newInstance(0);
+
   public AbstractCSQueue(CapacitySchedulerContext cs,
       String queueName, CSQueue parent, CSQueue old) throws IOException {
     this.labelManager = cs.getRMContext().getNodeLabelManager();
@@ -336,6 +338,9 @@ public abstract class AbstractCSQueue implements CSQueue {
           csContext.getConfiguration().getReservationContinueLook();
 
       this.preemptionDisabled = isQueueHierarchyPreemptionDisabled(this);
+
+      this.priority = csContext.getConfiguration().getQueuePriority(
+          getQueuePath());
     } finally {
       writeLock.unlock();
     }
@@ -934,4 +939,9 @@ public abstract class AbstractCSQueue implements CSQueue {
       this.writeLock.unlock();
     }
   }
+
+  @Override
+  public Priority getPriority() {
+    return this.priority;
+  }
 }

+ 7 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java

@@ -32,6 +32,7 @@ import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ContainerStatus;
+import org.apache.hadoop.yarn.api.records.Priority;
 import org.apache.hadoop.yarn.api.records.QueueACL;
 import org.apache.hadoop.yarn.api.records.QueueState;
 import org.apache.hadoop.yarn.api.records.Resource;
@@ -372,4 +373,10 @@ public interface CSQueue extends SchedulerQueue<CSQueue> {
    */
   public void validateSubmitApplication(ApplicationId applicationId,
       String userName, String queue) throws AccessControlException;
+
+  /**
+   * Get priority of queue
+   * @return queue priority
+   */
+  Priority getPriority();
 }

+ 65 - 10
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java

@@ -270,16 +270,6 @@ public class CapacityScheduler extends
     this.calculator = rc;
   }
 
-  @Override
-  public Comparator<CSQueue> getNonPartitionedQueueComparator() {
-    return CapacitySchedulerQueueManager.NON_PARTITIONED_QUEUE_COMPARATOR;
-  }
-
-  @Override
-  public PartitionedQueueComparator getPartitionedQueueComparator() {
-    return CapacitySchedulerQueueManager.PARTITIONED_QUEUE_COMPARATOR;
-  }
-
   @Override
   public int getNumClusterNodes() {
     return nodeTracker.nodeCount();
@@ -2512,4 +2502,69 @@ public class CapacityScheduler extends
   public CapacitySchedulerQueueManager getCapacitySchedulerQueueManager() {
     return this.queueManager;
   }
+
+  /**
+   * Try to move a reserved container to a targetNode.
+   * If the targetNode is already reserved by another container, the move is
+   * cancelled.
+   *
+   * @param toBeMovedContainer reserved container to be moved
+   * @param targetNode targetNode
+   * @return true if the move succeeded; false if the targetNode is reserved by
+   *         a different container or the move failed for any other reason.
+   */
+  public boolean moveReservedContainer(RMContainer toBeMovedContainer,
+      FiCaSchedulerNode targetNode) {
+    try {
+      writeLock.lock();
+
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Trying to move container=" + toBeMovedContainer + " to node="
+            + targetNode.getNodeID());
+      }
+
+      FiCaSchedulerNode sourceNode = getNode(toBeMovedContainer.getNodeId());
+      if (null == sourceNode) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Failed to move reservation, cannot find source node="
+              + toBeMovedContainer.getNodeId());
+        }
+        return false;
+      }
+
+      // Target node updated?
+      if (getNode(targetNode.getNodeID()) != targetNode) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(
+              "Failed to move reservation, node updated or removed, moving "
+                  + "cancelled.");
+        }
+        return false;
+      }
+
+      // Target node's reservation status changed?
+      if (targetNode.getReservedContainer() != null) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(
+              "Target node's reservation status changed, moving cancelled.");
+        }
+        return false;
+      }
+
+      FiCaSchedulerApp app = getApplicationAttempt(
+          toBeMovedContainer.getApplicationAttemptId());
+      if (null == app) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Cannot find to-be-moved container's application="
+              + toBeMovedContainer.getApplicationAttemptId());
+        }
+        return false;
+      }
+
+      // finally, move the reserved container
+      return app.moveReservation(toBeMovedContainer, sourceNode, targetNode);
+    } finally {
+      writeLock.unlock();
+    }
+  }
 }

+ 212 - 24
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java

@@ -18,19 +18,8 @@
 
 package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
 
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-import java.util.StringTokenizer;
-
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience.Private;
@@ -44,6 +33,7 @@ import org.apache.hadoop.yarn.api.records.QueueState;
 import org.apache.hadoop.yarn.api.records.ReservationACL;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
 import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager;
 import org.apache.hadoop.yarn.security.AccessType;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
@@ -51,6 +41,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.placement.UserGroupMappingP
 import org.apache.hadoop.yarn.server.resourcemanager.reservation.ReservationSchedulerConfiguration;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.AppPriorityACLConfigurationParser.AppPriorityACLKeyType;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.policy.PriorityUtilizationQueueOrderingPolicy;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.policy.QueueOrderingPolicy;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.policy.FairOrderingPolicy;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.policy.FifoOrderingPolicy;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.policy.OrderingPolicy;
@@ -59,7 +51,17 @@ import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
 import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
 import org.apache.hadoop.yarn.util.resource.Resources;
 
-import com.google.common.collect.ImmutableSet;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.StringTokenizer;
 
 public class CapacitySchedulerConfiguration extends ReservationSchedulerConfiguration {
 
@@ -127,14 +129,21 @@ public class CapacitySchedulerConfiguration extends ReservationSchedulerConfigur
   @Private
   public static final String MAXIMUM_ALLOCATION_VCORES =
       "maximum-allocation-vcores";
-  
+
+  /**
+   * Ordering policy of queues
+   */
   public static final String ORDERING_POLICY = "ordering-policy";
-  
-  public static final String FIFO_ORDERING_POLICY = "fifo";
 
-  public static final String FAIR_ORDERING_POLICY = "fair";
+  /*
+   * Ordering policy inside a leaf queue to sort apps
+   */
+  public static final String FIFO_APP_ORDERING_POLICY = "fifo";
+
+  public static final String FAIR_APP_ORDERING_POLICY = "fair";
 
-  public static final String DEFAULT_ORDERING_POLICY = FIFO_ORDERING_POLICY;
+  public static final String DEFAULT_APP_ORDERING_POLICY =
+      FIFO_APP_ORDERING_POLICY;
   
   @Private
   public static final int DEFAULT_MAXIMUM_SYSTEM_APPLICATIIONS = 10000;
@@ -298,6 +307,11 @@ public class CapacitySchedulerConfiguration extends ReservationSchedulerConfigur
     String queueName = PREFIX + queue + DOT;
     return queueName;
   }
+
+  static String getQueueOrderingPolicyPrefix(String queue) {
+    String queueName = PREFIX + queue + DOT + ORDERING_POLICY + DOT;
+    return queueName;
+  }
   
   private String getNodeLabelPrefix(String queue, String label) {
     if (label.equals(CommonNodeLabelsManager.NO_LABEL)) {
@@ -400,20 +414,23 @@ public class CapacitySchedulerConfiguration extends ReservationSchedulerConfigur
         DEFAULT_USER_LIMIT);
     return userLimit;
   }
-  
+
+  // TODO (wangda): We need to better distinguish app ordering policy and queue
+  // ordering policy's classname / configuration options, etc. And dedup code
+  // if possible.
   @SuppressWarnings("unchecked")
-  public <S extends SchedulableEntity> OrderingPolicy<S> getOrderingPolicy(
+  public <S extends SchedulableEntity> OrderingPolicy<S> getAppOrderingPolicy(
       String queue) {
   
-    String policyType = get(getQueuePrefix(queue) + ORDERING_POLICY, 
-      DEFAULT_ORDERING_POLICY);
+    String policyType = get(getQueuePrefix(queue) + ORDERING_POLICY,
+        DEFAULT_APP_ORDERING_POLICY);
     
     OrderingPolicy<S> orderingPolicy;
     
-    if (policyType.trim().equals(FIFO_ORDERING_POLICY)) {
+    if (policyType.trim().equals(FIFO_APP_ORDERING_POLICY)) {
        policyType = FifoOrderingPolicy.class.getName();
     }
-    if (policyType.trim().equals(FAIR_ORDERING_POLICY)) {
+    if (policyType.trim().equals(FAIR_APP_ORDERING_POLICY)) {
        policyType = FairOrderingPolicy.class.getName();
     }
     try {
@@ -734,6 +751,20 @@ public class CapacitySchedulerConfiguration extends ReservationSchedulerConfigur
     return Resources.createResource(maximumMemory, maximumCores);
   }
 
+  @Private
+  public Priority getQueuePriority(String queue) {
+    String queuePolicyPrefix = getQueuePrefix(queue);
+    Priority pri = Priority.newInstance(
+        getInt(queuePolicyPrefix + "priority", 0));
+    return pri;
+  }
+
+  @Private
+  public void setQueuePriority(String queue, int priority) {
+    String queuePolicyPrefix = getQueuePrefix(queue);
+    setInt(queuePolicyPrefix + "priority", priority);
+  }
+
   /**
    * Get the per queue setting for the maximum limit to allocate to
    * each container request.
@@ -1204,4 +1235,161 @@ public class CapacitySchedulerConfiguration extends ReservationSchedulerConfigur
         getInt(QUEUE_GLOBAL_MAX_APPLICATION, (int) UNDEFINED);
     return maxApplicationsPerQueue;
   }
+
+  /**
+   * Ordering policy inside a parent queue to sort queues
+   */
+
+  /**
+   * The queue with less relative usage gets the next resource; this is default
+   */
+  public static final String QUEUE_UTILIZATION_ORDERING_POLICY = "utilization";
+
+  /**
+   * Combination of relative usage and priority
+   */
+  public static final String QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY =
+      "priority-utilization";
+
+  public static final String DEFAULT_QUEUE_ORDERING_POLICY =
+      QUEUE_UTILIZATION_ORDERING_POLICY;
+
+
+  @Private
+  public void setQueueOrderingPolicy(String queue, String policy) {
+    set(getQueuePrefix(queue) + ORDERING_POLICY, policy);
+  }
+
+  @Private
+  public QueueOrderingPolicy getQueueOrderingPolicy(String queue,
+      String parentPolicy) {
+    String defaultPolicy = parentPolicy;
+    if (null == defaultPolicy) {
+      defaultPolicy = DEFAULT_QUEUE_ORDERING_POLICY;
+    }
+
+    String policyType = get(getQueuePrefix(queue) + ORDERING_POLICY,
+        defaultPolicy);
+
+    QueueOrderingPolicy qop;
+    if (policyType.trim().equals(QUEUE_UTILIZATION_ORDERING_POLICY)) {
+      // Doesn't respect priority
+      qop = new PriorityUtilizationQueueOrderingPolicy(false);
+    } else if (policyType.trim().equals(
+        QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY)) {
+      qop = new PriorityUtilizationQueueOrderingPolicy(true);
+    } else {
+      String message =
+          "Unable to construct queue ordering policy=" + policyType + " queue="
+              + queue;
+      throw new YarnRuntimeException(message);
+    }
+
+    return qop;
+  }
+
+  /*
+   * Get global configuration for ordering policies
+   */
+  private String getOrderingPolicyGlobalConfigKey(String orderPolicyName,
+      String configKey) {
+    return PREFIX + ORDERING_POLICY + DOT + orderPolicyName + DOT + configKey;
+  }
+
+  /**
+   * Global configurations of the priority-utilization queue ordering policy
+   */
+  private static final String UNDER_UTILIZED_PREEMPTION_ENABLED =
+      "underutilized-preemption.enabled";
+
+  /**
+   * Do we allow an under-utilized queue with higher priority to preempt a
+   * queue with lower priority *even if the lower-priority queue is not
+   * satisfied*?
+   *
+   * For example, given two queues, a and b:
+   * a.priority = 1, (a.used-capacity - a.reserved-capacity) = 40%
+   * b.priority = 0, b.used-capacity = 30%
+   *
+   * Set this configuration to true to allow queue-a to preempt containers
+   * from queue-b.
+   *
+   * (The reason we deduct reserved-capacity from used-capacity for the
+   * higher-priority queue is that reserved-capacity is just the scheduler's
+   * internal mechanism for allocating large containers; an application cannot
+   * actually use such reserved capacity. A queue with large container
+   * requests may hold many reserved containers yet be unable to allocate any
+   * of them. The scheduler will still make sure a satisfied queue never
+   * preempts resources from any other queue. A queue is considered satisfied
+   * when its used-capacity - reserved-capacity ≥ guaranteed-capacity.)
+   *
+   * @return allowed or not
+   */
+  public boolean getPUOrderingPolicyUnderUtilizedPreemptionEnabled() {
+    return getBoolean(getOrderingPolicyGlobalConfigKey(
+        QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY,
+        UNDER_UTILIZED_PREEMPTION_ENABLED), false);
+  }
+
+  @VisibleForTesting
+  public void setPUOrderingPolicyUnderUtilizedPreemptionEnabled(
+      boolean enabled) {
+    setBoolean(getOrderingPolicyGlobalConfigKey(
+        QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY,
+        UNDER_UTILIZED_PREEMPTION_ENABLED), enabled);
+  }
+
+  private static final String UNDER_UTILIZED_PREEMPTION_DELAY =
+      "underutilized-preemption.reserved-container-delay-ms";
+
+  /**
+   * When a reserved container is created for an under-utilized queue,
+   * preemption will kick in after the specified delay (in ms).
+   *
+   * The total time to preempt resources for a reserved container of a
+   * higher-priority queue will be: reserved-container-delay-ms +
+   * {@link CapacitySchedulerConfiguration#PREEMPTION_WAIT_TIME_BEFORE_KILL}.
+   *
+   * This parameter is added to make preemption from an under-utilized,
+   * lower-priority queue more conservative. It only takes effect when
+   * underutilized-preemption.enabled is set to true.
+   *
+   * @return delay
+   */
+  public long getPUOrderingPolicyUnderUtilizedPreemptionDelay() {
+    return getLong(getOrderingPolicyGlobalConfigKey(
+        QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY,
+        UNDER_UTILIZED_PREEMPTION_DELAY), 60000L);
+  }
+
+  @VisibleForTesting
+  public void setPUOrderingPolicyUnderUtilizedPreemptionDelay(
+      long timeout) {
+    setLong(getOrderingPolicyGlobalConfigKey(
+        QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY,
+        UNDER_UTILIZED_PREEMPTION_DELAY), timeout);
+  }
+
+  private static final String UNDER_UTILIZED_PREEMPTION_MOVE_RESERVATION =
+      "underutilized-preemption.allow-move-reservation";
+
+  /**
+   * When preempting from under-satisfied queues on behalf of a
+   * higher-priority queue, do we allow moving a reserved container from one
+   * host to another?
+   *
+   * @return allow or not
+   */
+  public boolean getPUOrderingPolicyUnderUtilizedPreemptionMoveReservation() {
+    return getBoolean(getOrderingPolicyGlobalConfigKey(
+        QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY,
+        UNDER_UTILIZED_PREEMPTION_MOVE_RESERVATION), false);
+  }
+
+  @VisibleForTesting
+  public void setPUOrderingPolicyUnderUtilizedPreemptionMoveReservation(
+      boolean allowMoveReservation) {
+    setBoolean(getOrderingPolicyGlobalConfigKey(
+        QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY,
+        UNDER_UTILIZED_PREEMPTION_MOVE_RESERVATION), allowMoveReservation);
+  }
 }
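Taken together, the three global knobs of the priority-utilization policy can be toggled through the @VisibleForTesting setters above. A hedged, test-style sketch (values are illustrative only):

    CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();
    // Let a higher-priority, under-utilized queue preempt from an unsatisfied
    // lower-priority queue...
    conf.setPUOrderingPolicyUnderUtilizedPreemptionEnabled(true);
    // ...but only after its reserved container has waited 30 seconds...
    conf.setPUOrderingPolicyUnderUtilizedPreemptionDelay(30 * 1000L);
    // ...and allow the reservation to be moved to another host meanwhile.
    conf.setPUOrderingPolicyUnderUtilizedPreemptionMoveReservation(true);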

+ 0 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerContext.java

@@ -61,10 +61,6 @@ public interface CapacitySchedulerContext {
   Configuration getConf();
 
   ResourceCalculator getResourceCalculator();
-
-  Comparator<CSQueue> getNonPartitionedQueueComparator();
-  
-  PartitionedQueueComparator getPartitionedQueueComparator();
   
   FiCaSchedulerNode getNode(NodeId nodeId);
 

+ 0 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerQueueManager.java

@@ -75,9 +75,6 @@ public class CapacitySchedulerQueueManager implements SchedulerQueueManager<
     }
   };
 
-  static final PartitionedQueueComparator PARTITIONED_QUEUE_COMPARATOR =
-      new PartitionedQueueComparator();
-
   static class QueueHook {
     public CSQueue hook(CSQueue queue) {
       return queue;

+ 2 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java

@@ -185,7 +185,7 @@ public class LeafQueue extends AbstractCSQueue {
       CapacitySchedulerConfiguration conf = csContext.getConfiguration();
 
       setOrderingPolicy(
-          conf.<FiCaSchedulerApp>getOrderingPolicy(getQueuePath()));
+          conf.<FiCaSchedulerApp>getAppOrderingPolicy(getQueuePath()));
 
       userLimit = conf.getUserLimit(getQueuePath());
       userLimitFactor = conf.getUserLimitFactor(getQueuePath());
@@ -287,7 +287,7 @@ public class LeafQueue extends AbstractCSQueue {
               .toString() + "\n" + "reservationsContinueLooking = "
               + reservationsContinueLooking + "\n" + "preemptionDisabled = "
               + getPreemptionDisabled() + "\n" + "defaultAppPriorityPerQueue = "
-              + defaultAppPriorityPerQueue);
+              + defaultAppPriorityPerQueue + "\npriority = " + priority);
     } finally {
       writeLock.unlock();
     }

+ 39 - 31
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java

@@ -18,18 +18,6 @@
 
 package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -64,6 +52,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.Activi
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivityDiagnosticConstant;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivityState;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.AllocationState;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.policy.QueueOrderingPolicy;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ContainerAllocationProposal;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ResourceCommitRequest;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.SchedulerContainer;
@@ -73,29 +62,34 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.placement.Placeme
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.placement.PlacementSetUtils;
 import org.apache.hadoop.yarn.util.resource.Resources;
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
 @Private
 @Evolving
 public class ParentQueue extends AbstractCSQueue {
 
   private static final Log LOG = LogFactory.getLog(ParentQueue.class);
 
-  protected final Set<CSQueue> childQueues;  
+  protected final List<CSQueue> childQueues;
   private final boolean rootQueue;
-  private final Comparator<CSQueue> nonPartitionedQueueComparator;
-  private final PartitionedQueueComparator partitionQueueComparator;
   private volatile int numApplications;
   private final CapacitySchedulerContext scheduler;
 
   private final RecordFactory recordFactory = 
     RecordFactoryProvider.getRecordFactory(null);
 
+  private QueueOrderingPolicy queueOrderingPolicy;
+
   public ParentQueue(CapacitySchedulerContext cs,
       String queueName, CSQueue parent, CSQueue old) throws IOException {
     super(cs, queueName, parent, old);
     this.scheduler = cs;
-    this.nonPartitionedQueueComparator = cs.getNonPartitionedQueueComparator();
-    this.partitionQueueComparator = new PartitionedQueueComparator();
-
     this.rootQueue = (parent == null);
 
     float rawCapacity = cs.getConfiguration().getNonLabeledQueueCapacity(getQueuePath());
@@ -107,7 +101,7 @@ public class ParentQueue extends AbstractCSQueue {
           ". Must be " + CapacitySchedulerConfiguration.MAXIMUM_CAPACITY_VALUE);
     }
 
-    this.childQueues = new TreeSet<CSQueue>(nonPartitionedQueueComparator);
+    this.childQueues = new ArrayList<>();
 
     setupQueueConfigs(cs.getClusterResource());
 
@@ -116,7 +110,14 @@ public class ParentQueue extends AbstractCSQueue {
         ", fullname=" + getQueuePath());
   }
 
-  void setupQueueConfigs(Resource clusterResource)
+  // Returns the name of the configured queue ordering policy.
+  private String getQueueOrderingPolicyConfigName() {
+    return queueOrderingPolicy == null ?
+        null :
+        queueOrderingPolicy.getConfigName();
+  }
+
+  protected void setupQueueConfigs(Resource clusterResource)
       throws IOException {
     try {
       writeLock.lock();
@@ -134,13 +135,22 @@ public class ParentQueue extends AbstractCSQueue {
         }
       }
 
+      // Initialize queue ordering policy
+      queueOrderingPolicy = csContext.getConfiguration().getQueueOrderingPolicy(
+          getQueuePath(), parent == null ?
+              null :
+              ((ParentQueue) parent).getQueueOrderingPolicyConfigName());
+      queueOrderingPolicy.setQueues(childQueues);
+
       LOG.info(queueName + ", capacity=" + this.queueCapacities.getCapacity()
           + ", absoluteCapacity=" + this.queueCapacities.getAbsoluteCapacity()
           + ", maxCapacity=" + this.queueCapacities.getMaximumCapacity()
           + ", absoluteMaxCapacity=" + this.queueCapacities
           .getAbsoluteMaximumCapacity() + ", state=" + getState() + ", acls="
           + aclsString + ", labels=" + labelStrBuilder.toString() + "\n"
-          + ", reservationsContinueLooking=" + reservationsContinueLooking);
+          + ", reservationsContinueLooking=" + reservationsContinueLooking
+          + ", orderingPolicy=" + getQueueOrderingPolicyConfigName()
+          + ", priority=" + priority);
     } finally {
       writeLock.unlock();
     }
@@ -294,8 +304,8 @@ public class ParentQueue extends AbstractCSQueue {
 
       // Re-configure existing child queues and add new ones
       // The CS has already checked to ensure all existing child queues are present!
-      Map<String, CSQueue> currentChildQueues = getQueues(childQueues);
-      Map<String, CSQueue> newChildQueues = getQueues(
+      Map<String, CSQueue> currentChildQueues = getQueuesMap(childQueues);
+      Map<String, CSQueue> newChildQueues = getQueuesMap(
           newlyParsedParentQueue.childQueues);
       for (Map.Entry<String, CSQueue> e : newChildQueues.entrySet()) {
         String newChildQueueName = e.getKey();
@@ -338,7 +348,7 @@ public class ParentQueue extends AbstractCSQueue {
     }
   }
 
-  Map<String, CSQueue> getQueues(Set<CSQueue> queues) {
+  private Map<String, CSQueue> getQueuesMap(List<CSQueue> queues) {
     Map<String, CSQueue> queuesMap = new HashMap<String, CSQueue>();
     for (CSQueue queue : queues) {
       queuesMap.put(queue.getQueueName(), queue);
@@ -680,13 +690,7 @@ public class ParentQueue extends AbstractCSQueue {
 
   private Iterator<CSQueue> sortAndGetChildrenAllocationIterator(
       String partition) {
-    // Previously we keep a sorted list for default partition, it is not good
-    // when multi-threading scheduler is enabled, so to make a simpler code
-    // now re-sort queue every time irrespective to node partition.
-    partitionQueueComparator.setPartitionToLookAt(partition);
-    List<CSQueue> childrenList = new ArrayList<>(childQueues);
-    Collections.sort(childrenList, partitionQueueComparator);
-    return childrenList.iterator();
+    return queueOrderingPolicy.getAssignmentIterator(partition);
   }
 
   private CSAssignment assignContainersToChildQueues(Resource cluster,
@@ -1083,4 +1087,8 @@ public class ParentQueue extends AbstractCSQueue {
       this.writeLock.unlock();
     }
   }
+
+  public QueueOrderingPolicy getQueueOrderingPolicy() {
+    return queueOrderingPolicy;
+  }
 }
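Since the ordering policy sorts a snapshot of the shared childQueues list, callers are expected to hold the parent queue's read lock around the iteration (see the QueueOrderingPolicy contract below). A sketch of the calling pattern, not the literal assignContainersToChildQueues body:

    readLock.lock();
    try {
      // The read lock guards childQueues; the policy copies it before sorting.
      Iterator<CSQueue> iter =
          queueOrderingPolicy.getAssignmentIterator(partition);
      while (iter.hasNext()) {
        CSQueue child = iter.next();
        // ... try to assign containers to 'child' ...
      }
    } finally {
      readLock.unlock();
    }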

+ 0 - 72
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/PartitionedQueueComparator.java

@@ -1,72 +0,0 @@
-/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
-
-import java.util.Comparator;
-
-import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
-
-public class PartitionedQueueComparator implements Comparator<CSQueue> {
-  private String partitionToLookAt = null;
-  
-  public void setPartitionToLookAt(String partitionToLookAt) {
-    this.partitionToLookAt = partitionToLookAt;
-  }
-  
-
-  @Override
-  public int compare(CSQueue q1, CSQueue q2) {
-    /*
-     * 1. Check accessible to given partition, if one queue accessible and
-     * the other not, accessible queue goes first.
-     */
-    boolean q1Accessible =
-        q1.getAccessibleNodeLabels().contains(partitionToLookAt)
-            || q1.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY);
-    boolean q2Accessible =
-        q2.getAccessibleNodeLabels().contains(partitionToLookAt)
-            || q2.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY);
-    if (q1Accessible && !q2Accessible) {
-      return -1;
-    } else if (!q1Accessible && q2Accessible) {
-      return 1;
-    }
-
-    /*
-     * 
-     * 2. When two queue has same accessibility, check who will go first:
-     * Now we simply compare their used resource on the partition to lookAt
-     */
-    float used1 = q1.getQueueCapacities().getUsedCapacity(partitionToLookAt);
-    float used2 = q2.getQueueCapacities().getUsedCapacity(partitionToLookAt);
-    if (Math.abs(used1 - used2) < 1e-6) {
-      // When used capacity is same, compare their guaranteed-capacity
-      float cap1 = q1.getQueueCapacities().getCapacity(partitionToLookAt);
-      float cap2 = q2.getQueueCapacities().getCapacity(partitionToLookAt);
-      
-      // when cap1 == cap2, we will compare queue's name
-      if (Math.abs(cap1 - cap2) < 1e-6) {
-        return q1.getQueueName().compareTo(q2.getQueueName());
-      }
-      return Float.compare(cap2, cap1);
-    }
-    
-    return Float.compare(used1, used2);
-  }
-}

+ 186 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/policy/PriorityUtilizationQueueOrderingPolicy.java

@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.policy;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.function.Supplier;
+
+/**
+ * For two queues with the same priority:
+ * - The queue with less relative used-capacity goes first (today's behavior).
+ * - The default priority for all queues is 0, i.e. equal. So we get today's
+ *   behavior at every level: the queue with the lowest used-capacity
+ *   percentage gets the resources.
+ *
+ * For two queues with different priorities:
+ * - Both the queues are under their guaranteed capacities: The queue with
+ *   the higher priority gets resources
+ * - Both the queues are over or meeting their guaranteed capacities:
+ *   The queue with the higher priority gets resources
+ * - One of the queues is over or meeting their guaranteed capacities and the
+ *   other is under: The queue that is under its capacity guarantee gets the
+ *   resources.
+ */
+public class PriorityUtilizationQueueOrderingPolicy implements QueueOrderingPolicy {
+  private List<CSQueue> queues;
+  private boolean respectPriority;
+
+  // This allows multiple threads to sort queues at the same time for
+  // different partitions.
+  private static ThreadLocal<String> partitionToLookAt =
+      ThreadLocal.withInitial(new Supplier<String>() {
+        @Override
+        public String get() {
+          return RMNodeLabelsManager.NO_LABEL;
+        }
+      });
+
+  /**
+   * Compare two queues with possibly different priorities and assigned
+   * capacities. This will be used by the preemption policy as well.
+   *
+   * @param relativeAssigned1 relative used-capacity of the first queue
+   * @param relativeAssigned2 relative used-capacity of the second queue
+   * @param priority1 priority of the first queue
+   * @param priority2 priority of the second queue
+   * @return compared result
+   */
+  public static int compare(double relativeAssigned1, double relativeAssigned2,
+      int priority1, int priority2) {
+    if (priority1 == priority2) {
+      // The queue with less relative used-capacity goes first
+      return Double.compare(relativeAssigned1, relativeAssigned2);
+    } else {
+      // When priority is different:
+      if ((relativeAssigned1 < 1.0f && relativeAssigned2 < 1.0f) || (
+          relativeAssigned1 >= 1.0f && relativeAssigned2 >= 1.0f)) {
+        // When both queues are under their guaranteed capacities,
+        // or both queues are over or meeting their guaranteed capacities,
+        // the queue with higher priority goes first
+        return Integer.compare(priority2, priority1);
+      } else {
+        // Otherwise, when one of the queues is over or meeting their
+        // guaranteed capacities and the other is under: The queue that is
+        // under its capacity guarantee gets the resources.
+        return Double.compare(relativeAssigned1, relativeAssigned2);
+      }
+    }
+  }
+
+  /**
+   * Comparator that looks at both priority and utilization.
+   */
+  private class PriorityQueueComparator implements Comparator<CSQueue> {
+
+    @Override
+    public int compare(CSQueue q1, CSQueue q2) {
+      String p = partitionToLookAt.get();
+
+      int rc = compareQueueAccessToPartition(q1, q2, p);
+      if (0 != rc) {
+        return rc;
+      }
+
+      float used1 = q1.getQueueCapacities().getUsedCapacity(p);
+      float used2 = q2.getQueueCapacities().getUsedCapacity(p);
+      int p1 = 0;
+      int p2 = 0;
+      if (respectPriority) {
+        p1 = q1.getPriority().getPriority();
+        p2 = q2.getPriority().getPriority();
+      }
+
+      rc = PriorityUtilizationQueueOrderingPolicy.compare(used1, used2, p1, p2);
+
+      // For queues with the same used ratio / priority, the queue with higher
+      // configured capacity goes first.
+      if (0 == rc) {
+        float abs1 = q1.getQueueCapacities().getAbsoluteCapacity(p);
+        float abs2 = q2.getQueueCapacities().getAbsoluteCapacity(p);
+        return Float.compare(abs2, abs1);
+      }
+
+      return rc;
+    }
+
+    private int compareQueueAccessToPartition(CSQueue q1, CSQueue q2, String partition) {
+      // Everybody has access to default partition
+      if (StringUtils.equals(partition, RMNodeLabelsManager.NO_LABEL)) {
+        return 0;
+      }
+
+      /*
+       * Check accessible to given partition, if one queue accessible and
+       * the other not, accessible queue goes first.
+       */
+      // The null check must guard both contains() calls; otherwise the ANY
+      // lookup could still NPE when a queue's accessible labels are null.
+      boolean q1Accessible = q1.getAccessibleNodeLabels() != null
+          && (q1.getAccessibleNodeLabels().contains(partition)
+              || q1.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY));
+      boolean q2Accessible = q2.getAccessibleNodeLabels() != null
+          && (q2.getAccessibleNodeLabels().contains(partition)
+              || q2.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY));
+      if (q1Accessible && !q2Accessible) {
+        return -1;
+      } else if (!q1Accessible && q2Accessible) {
+        return 1;
+      }
+
+      return 0;
+    }
+  }
+
+  public PriorityUtilizationQueueOrderingPolicy(boolean respectPriority) {
+    this.respectPriority = respectPriority;
+  }
+
+  @Override
+  public void setQueues(List<CSQueue> queues) {
+    this.queues = queues;
+  }
+
+  @Override
+  public Iterator<CSQueue> getAssignmentIterator(String partition) {
+    // Since partitionToLookAt is a thread-local variable and we copy the
+    // queue list before sorting, this is safe in a multi-threaded environment.
+    PriorityUtilizationQueueOrderingPolicy.partitionToLookAt.set(partition);
+    List<CSQueue> sortedQueue = new ArrayList<>(queues);
+    Collections.sort(sortedQueue, new PriorityQueueComparator());
+    return sortedQueue.iterator();
+  }
+
+  @Override
+  public String getConfigName() {
+    if (respectPriority) {
+      return CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY;
+    } else {
+      return CapacitySchedulerConfiguration.QUEUE_UTILIZATION_ORDERING_POLICY;
+    }
+  }
+}
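The static compare() above encodes the three rules from the class Javadoc. A few illustrative calls, with made-up relative usages; a negative result means the first queue is ordered first:

    // Same priority: lower relative usage goes first.
    assert PriorityUtilizationQueueOrderingPolicy.compare(0.3, 0.7, 0, 0) < 0;
    // Both under their guarantees: higher priority goes first.
    assert PriorityUtilizationQueueOrderingPolicy.compare(0.9, 0.2, 2, 1) < 0;
    // One under and one over its guarantee: the under-satisfied queue goes
    // first, regardless of priority.
    assert PriorityUtilizationQueueOrderingPolicy.compare(0.5, 1.2, 1, 2) < 0;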

+ 52 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/policy/QueueOrderingPolicy.java

@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.policy;
+
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
+
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * This will be used by
+ * {@link org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue}
+ * to decide allocation ordering of child queues.
+ */
+public interface QueueOrderingPolicy {
+  void setQueues(List<CSQueue> queues);
+
+  /**
+   * Return an iterator over the collection of CSQueues which orders
+   * them for container assignment.
+   *
+   * Please note that, to avoid the queue set being updated during sorting /
+   * iterating, the caller needs to make sure the parent queue's read lock is
+   * properly acquired.
+   *
+   * @param partition nodePartition
+   *
+   * @return iterator of queues to allocate
+   */
+  Iterator<CSQueue> getAssignmentIterator(String partition);
+
+  /**
+   * Returns the configuration name (which will be used to set the ordering policy).
+   * @return configuration name
+   */
+  String getConfigName();
+}
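As a hedged illustration of this contract only (a policy like this does not ship with the scheduler, and its config name would not be recognized by CapacitySchedulerConfiguration), a name-ordered implementation could look like:

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.Iterator;
    import java.util.List;

    public class NameQueueOrderingPolicy implements QueueOrderingPolicy {
      private List<CSQueue> queues;

      @Override
      public void setQueues(List<CSQueue> queues) {
        this.queues = queues;
      }

      @Override
      public Iterator<CSQueue> getAssignmentIterator(String partition) {
        // Sort a copy; the caller must hold the parent queue's read lock.
        List<CSQueue> sorted = new ArrayList<>(queues);
        sorted.sort(Comparator.comparing(CSQueue::getQueueName));
        return sorted.iterator();
      }

      @Override
      public String getConfigName() {
        return "name"; // hypothetical config value, for illustration only
      }
    }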

+ 89 - 7
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java

@@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEven
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerFinishedEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerReservedEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
@@ -98,13 +99,13 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
 
   private final Set<ContainerId> containersToPreempt =
     new HashSet<ContainerId>();
-    
+
   private CapacityHeadroomProvider headroomProvider;
 
   private ResourceCalculator rc = new DefaultResourceCalculator();
 
   private ResourceScheduler scheduler;
-  
+
   private AbstractContainerAllocator containerAllocator;
 
   /**
@@ -115,7 +116,7 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
   private Map<ContainerId, SchedContainerChangeRequest> toBeRemovedIncRequests =
       new ConcurrentHashMap<>();
 
-  public FiCaSchedulerApp(ApplicationAttemptId applicationAttemptId, 
+  public FiCaSchedulerApp(ApplicationAttemptId applicationAttemptId,
       String user, Queue queue, ActiveUsersManager activeUsersManager,
       RMContext rmContext) {
     this(applicationAttemptId, user, queue, activeUsersManager, rmContext,
@@ -831,7 +832,7 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
     }
     return null;
   }
-  
+
   public void setHeadroomProvider(
     CapacityHeadroomProvider headroomProvider) {
     try {
@@ -841,7 +842,7 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
       writeLock.unlock();
     }
   }
-  
+
   @Override
   public Resource getHeadroom() {
     try {
@@ -855,7 +856,7 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
     }
 
   }
-  
+
   @Override
   public void transferStateFromPreviousAttempt(
       SchedulerApplicationAttempt appAttempt) {
@@ -867,7 +868,7 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
       writeLock.unlock();
     }
   }
-  
+
   public boolean reserveIncreasedContainer(SchedulerRequestKey schedulerKey,
       FiCaSchedulerNode node,
       RMContainer rmContainer, Resource reservedResource) {
@@ -1148,4 +1149,85 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
   public boolean equals(Object o) {
     return super.equals(o);
   }
+
+  /**
+   * Move a reservation from one node to another.
+   * Compared to unreserving the container on the source node and reserving a
+   * new container on the target node, this method will not create a new
+   * RMContainer instance, and the operation is atomic.
+   *
+   * @param reservedContainer the reserved container to be moved
+   * @param sourceNode source node
+   * @param targetNode target node
+   *
+   * @return succeeded or not
+   */
+  public boolean moveReservation(RMContainer reservedContainer,
+      FiCaSchedulerNode sourceNode, FiCaSchedulerNode targetNode) {
+    try {
+      writeLock.lock();
+      if (!sourceNode.getPartition().equals(targetNode.getPartition())) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(
+              "Failed to move reservation, two nodes are in different partition");
+        }
+        return false;
+      }
+
+      // Update reserved container to node map
+      Map<NodeId, RMContainer> map = reservedContainers.get(
+          reservedContainer.getReservedSchedulerKey());
+      if (null == map) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Cannot find reserved container map.");
+        }
+        return false;
+      }
+
+      // Check if reserved container changed
+      if (sourceNode.getReservedContainer() != reservedContainer) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("To-be-moved container already updated.");
+        }
+        return false;
+      }
+
+      // Check if target node is empty; acquire the target node's lock to make
+      // sure the reservation happens transactionally
+      synchronized (targetNode) {
+        if (targetNode.getReservedContainer() != null) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Target node is already occupied before moving");
+          }
+          // Bail out here rather than fall through to a reserve that fails.
+          return false;
+        }
+
+        try {
+          targetNode.reserveResource(this,
+              reservedContainer.getReservedSchedulerKey(), reservedContainer);
+        } catch (IllegalStateException e) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Reserve on target node failed, e=", e);
+          }
+          return false;
+        }
+
+        // Set source node's reserved container to null
+        sourceNode.setReservedContainer(null);
+        map.remove(sourceNode.getNodeID());
+
+        // Update reserved container
+        reservedContainer.handle(
+            new RMContainerReservedEvent(reservedContainer.getContainerId(),
+                reservedContainer.getReservedResource(), targetNode.getNodeID(),
+                reservedContainer.getReservedSchedulerKey()));
+
+        // Add to target node
+        map.put(targetNode.getNodeID(), reservedContainer);
+
+        return true;
+      }
+    } finally {
+      writeLock.unlock();
+    }
+  }
 }
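A hedged sketch of how a preemption-side caller might use moveReservation, assuming it already holds the reserved RMContainer and both FiCaSchedulerNode references:

    // moveReservation re-validates the partition match and target occupancy
    // atomically, so the caller only needs to handle the boolean outcome.
    if (!app.moveReservation(reservedContainer, sourceNode, targetNode)) {
      // The reservation is left untouched on sourceNode; try another target.
    }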

+ 64 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicyMockFramework.java

@@ -36,6 +36,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.policy.QueueOrderingPolicy;
 import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
@@ -61,6 +62,7 @@ import org.mockito.stubbing.Answer;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -476,6 +478,14 @@ public class ProportionalCapacityPreemptionPolicyMockFramework {
    * -B...
    * </pre>
    * ";" splits queues, and there should no empty lines, no extra spaces
+   *
+   * For each queue, configurations can be given to specify capacities (for
+   * each partition); the format is:
+   * <pre>
+   * -<queueName> (<labelName1>=[guaranteed max used pending], \
+   *               <labelName2>=[guaranteed max used pending])
+   *              {key1=value1,key2=value2};  // Additional configs
+   * </pre>
    */
   @SuppressWarnings({ "unchecked", "rawtypes" })
   private ParentQueue mockQueueHierarchy(String queueExprs) {
@@ -491,6 +501,10 @@ public class ProportionalCapacityPreemptionPolicyMockFramework {
         queue = parentQueue;
         List<CSQueue> children = new ArrayList<CSQueue>();
         when(parentQueue.getChildQueues()).thenReturn(children);
+        QueueOrderingPolicy policy = mock(QueueOrderingPolicy.class);
+        when(policy.getConfigName()).thenReturn(
+            CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY);
+        when(parentQueue.getQueueOrderingPolicy()).thenReturn(policy);
       } else {
         LeafQueue leafQueue = mock(LeafQueue.class);
         final TreeSet<FiCaSchedulerApp> apps = new TreeSet<>(
@@ -625,10 +639,56 @@ public class ProportionalCapacityPreemptionPolicyMockFramework {
     when(queue.getPreemptionDisabled()).thenReturn(
         conf.getPreemptionDisabled(queuePath, false));
 
+    // Setup other queue configurations
+    Map<String, String> otherConfigs = getOtherConfigurations(
+        queueExprArray[idx]);
+    if (otherConfigs.containsKey("priority")) {
+      when(queue.getPriority()).thenReturn(
+          Priority.newInstance(Integer.valueOf(otherConfigs.get("priority"))));
+    } else {
+      // set queue's priority to 0 by default
+      when(queue.getPriority()).thenReturn(Priority.newInstance(0));
+    }
+
+    // Setup disable preemption of queues
+    if (otherConfigs.containsKey("disable_preemption")) {
+      when(queue.getPreemptionDisabled()).thenReturn(
+          Boolean.valueOf(otherConfigs.get("disable_preemption")));
+    }
+
     nameToCSQueues.put(queueName, queue);
     when(cs.getQueue(eq(queueName))).thenReturn(queue);
   }
 
+  /**
+   * Get a queue's additional configurations
+   * @param queueExpr queue expression
+   * @return map of configs
+   */
+  private Map<String, String> getOtherConfigurations(String queueExpr) {
+    if (queueExpr.contains("{")) {
+      int left = queueExpr.indexOf('{');
+      int right = queueExpr.indexOf('}');
+
+      if (right > left) {
+        Map<String, String> configs = new HashMap<>();
+
+        String subStr = queueExpr.substring(left + 1, right);
+        for (String kv : subStr.split(",")) {
+          if (kv.contains("=")) {
+            String key = kv.substring(0, kv.indexOf("="));
+            String value = kv.substring(kv.indexOf("=") + 1);
+            configs.put(key, value);
+          }
+        }
+
+        return configs;
+      }
+    }
+
+    return Collections.emptyMap();
+  }
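For instance, a queue expression carrying the additional-config suffix (hypothetical values) would parse as follows:

    Map<String, String> configs = getOtherConfigurations(
        "-a(=[10 100 35 50]){priority=2,disable_preemption=true}");
    assert "2".equals(configs.get("priority"));
    assert "true".equals(configs.get("disable_preemption"));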
+
   /**
    * Level of a queue is how many "-" at beginning, root's level is 0
    */
@@ -739,6 +799,10 @@ public class ProportionalCapacityPreemptionPolicyMockFramework {
     Assert.assertEquals(pending, ru.getPending(partition).getMemorySize());
   }
 
+  public void checkPriority(CSQueue queue, int expectedPriority) {
+    Assert.assertEquals(expectedPriority, queue.getPriority().getPriority());
+  }
+
   public void checkReservedResource(CSQueue queue, String partition, int reserved) {
     ResourceUsage ru = queue.getQueueResourceUsage();
     Assert.assertEquals(reserved, ru.getReserved(partition).getMemorySize());

+ 361 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestPreemptionForQueueWithPriorities.java

@@ -0,0 +1,361 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+
+import static org.mockito.Matchers.argThat;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+public class TestPreemptionForQueueWithPriorities
+    extends ProportionalCapacityPreemptionPolicyMockFramework {
+  @Before
+  public void setup() {
+    super.setup();
+    policy = new ProportionalCapacityPreemptionPolicy(rmContext, cs, mClock);
+  }
+
+  @Test
+  public void testPreemptionForHighestPriorityUnderutilizedQueue()
+      throws IOException {
+    /**
+     * The simplest test of queues with priorities. Queue structure is:
+     *
+     * <pre>
+     *        root
+     *       / |  \
+     *      a  b   c
+     * </pre>
+     *
+     * For priorities
+     * - a=1
+     * - b/c=2
+     *
+     * So c will preempt more resources from a, until a reaches its guaranteed
+     * resource.
+     */
+    String labelsConfig = "=100,true"; // default partition
+    String nodesConfig = "n1="; // only one node
+    String queuesConfig =
+        // guaranteed,max,used,pending
+        "root(=[100 100 100 100]);" + //root
+            "-a(=[30 100 40 50]){priority=1};" + // a
+            "-b(=[30 100 59 50]){priority=2};" + // b
+            "-c(=[40 100 1 25]){priority=2}";    // c
+    String appsConfig =
+        //queueName\t(priority,resource,host,expression,#repeat,reserved)
+        "a\t(1,1,n1,,40,false);" + // app1 in a
+        "b\t(1,1,n1,,59,false);" + // app2 in b
+        "c\t(1,1,n1,,1,false);";   // app3 in c
+
+    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
+    policy.editSchedule();
+
+    // 10 preempted from app1, 15 preempted from app2, and nothing preempted
+    // from app3
+    verify(mDisp, times(10)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(1))));
+    verify(mDisp, times(15)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(2))));
+    verify(mDisp, never()).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(3))));
+  }
+
+  @Test
+  public void testPreemptionForLowestPriorityUnderutilizedQueue()
+      throws IOException {
+    /**
+     * Similar to the above: make sure the less-utilized queue can still get
+     * resources first regardless of priority.
+     *
+     * Queue structure is:
+     *
+     * <pre>
+     *        root
+     *       / |  \
+     *      a  b   c
+     * </pre>
+     *
+     * For priorities
+     * - a=1
+     * - b=2
+     * - c=0
+     *
+     * So c will preempt more resources from a, until a reaches its guaranteed
+     * resource.
+     */
+    String labelsConfig = "=100,true"; // default partition
+    String nodesConfig = "n1="; // only one node
+    String queuesConfig =
+        // guaranteed,max,used,pending
+        "root(=[100 100 100 100]);" + //root
+            "-a(=[30 100 40 50]){priority=1};" + // a
+            "-b(=[30 100 59 50]){priority=2};" + // b
+            "-c(=[40 100 1 25]){priority=0}";    // c
+    String appsConfig =
+        //queueName\t(priority,resource,host,expression,#repeat,reserved)
+            "a\t(1,1,n1,,40,false);" + // app1 in a
+            "b\t(1,1,n1,,59,false);" + // app2 in b
+            "c\t(1,1,n1,,1,false);";   // app3 in c
+
+    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
+    policy.editSchedule();
+
+    // 10 preempted from app1, 15 preempted from app2, and nothing preempted
+    // from app3
+    verify(mDisp, times(10)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(1))));
+    verify(mDisp, times(15)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(2))));
+    verify(mDisp, never()).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(3))));
+  }
+
+  @Test
+  public void testPreemptionWontHappenBetweenSatisfiedQueues()
+      throws IOException {
+    /**
+     * No preemption happens if a queue is already satisfied, regardless of
+     * priority.
+     *
+     * Queue structure is:
+     *
+     * <pre>
+     *        root
+     *       / |  \
+     *      a  b   c
+     * </pre>
+     *
+     * For priorities
+     * - a=1
+     * - b=1
+     * - c=2
+     *
+     * When c is satisfied, it will not preempt any resources from other queues.
+     */
+    String labelsConfig = "=100,true"; // default partition
+    String nodesConfig = "n1="; // only one node
+    String queuesConfig =
+        // guaranteed,max,used,pending
+        "root(=[100 100 100 100]);" + //root
+            "-a(=[30 100 0 0]){priority=1};" + // a
+            "-b(=[30 100 40 50]){priority=1};" + // b
+            "-c(=[40 100 60 25]){priority=2}";   // c
+    String appsConfig =
+        //queueName\t(priority,resource,host,expression,#repeat,reserved)
+        "b\t(1,1,n1,,40,false);" + // app1 in b
+        "c\t(1,1,n1,,60,false)"; // app2 in c
+
+    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
+    policy.editSchedule();
+
+    // Nothing preempted
+    verify(mDisp, never()).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(1))));
+    verify(mDisp, never()).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(2))));
+  }
+
+  @Test
+  public void testPreemptionForMultipleQueuesInTheSamePriorityBuckets()
+      throws IOException {
+    /**
+     * When a cluster has different priorities and each priority has multiple
+     * queues, the preemption policy should try to balance resources between
+     * queues with the same priority in proportion to their capacities.
+     *
+     * Queue structure is:
+     *
+     * <pre>
+     * root
+     * - a (capacity=10), p=1
+     * - b (capacity=15), p=1
+     * - c (capacity=20), p=2
+     * - d (capacity=25), p=2
+     * - e (capacity=30), p=2
+     * </pre>
+     */
+    String labelsConfig = "=100,true"; // default partition
+    String nodesConfig = "n1="; // only one node
+    String queuesConfig =
+        // guaranteed,max,used,pending
+        "root(=[100 100 100 100]);" + //root
+            "-a(=[10 100 35 50]){priority=1};" + // a
+            "-b(=[15 100 25 50]){priority=1};" + // b
+            "-c(=[20 100 39 50]){priority=2};" + // c
+            "-d(=[25 100 0 0]){priority=2};" + // d
+            "-e(=[30 100 1 99]){priority=2}";   // e
+    String appsConfig =
+        //queueName\t(priority,resource,host,expression,#repeat,reserved)
+        "a\t(1,1,n1,,35,false);" + // app1 in a
+        "b\t(1,1,n1,,25,false);" + // app2 in b
+            "c\t(1,1,n1,,39,false);" + // app3 in c
+            "e\t(1,1,n1,,1,false)"; // app4 in e
+
+    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
+    policy.editSchedule();
+
+    // 23 preempted from app1, 6 preempted from app2, and nothing preempted
+    // from app3/app4
+    // (After preemption, a has 35 - 23 = 12, b has 25 - 6 = 19, so a:b after
+    //  preemption is 1.58, close to 1.50)
+    verify(mDisp, times(23)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(1))));
+    verify(mDisp, times(6)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(2))));
+    verify(mDisp, never()).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(3))));
+    verify(mDisp, never()).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(4))));
+  }
+
+  @Test
+  public void testPreemptionForPriorityAndDisablePreemption()
+      throws IOException {
+    /**
+     * When a cluster has different priorities and each priority has multiple
+     * queues, the preemption policy should try to balance resources between
+     * queues with the same priority in proportion to their capacities.
+     *
+     * But we also need to make sure that disabling preemption is honored
+     * regardless of priority.
+     *
+     * Queue structure is:
+     *
+     * <pre>
+     * root
+     * - a (capacity=10), p=1
+     * - b (capacity=15), p=1
+     * - c (capacity=20), p=2
+     * - d (capacity=25), p=2
+     * - e (capacity=30), p=2
+     * </pre>
+     */
+    String labelsConfig = "=100,true"; // default partition
+    String nodesConfig = "n1="; // only one node
+    String queuesConfig =
+        // guaranteed,max,used,pending
+        "root(=[100 100 100 100]);" + //root
+            "-a(=[10 100 35 50]){priority=1,disable_preemption=true};" + // a
+            "-b(=[15 100 25 50]){priority=1};" + // b
+            "-c(=[20 100 39 50]){priority=2};" + // c
+            "-d(=[25 100 0 0]){priority=2};" + // d
+            "-e(=[30 100 1 99]){priority=2}";   // e
+    String appsConfig =
+        //queueName\t(priority,resource,host,expression,#repeat,reserved)
+        "a\t(1,1,n1,,35,false);" + // app1 in a
+            "b\t(1,1,n1,,25,false);" + // app2 in b
+            "c\t(1,1,n1,,39,false);" + // app3 in c
+            "e\t(1,1,n1,,1,false)"; // app4 in e
+
+    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
+    policy.editSchedule();
+
+    // We are supposed to preempt some resources from a, but since queue a
+    // disables preemption, we need to preempt some resources from b and some
+    // from c, even though c has higher priority than a
+    verify(mDisp, never()).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(1))));
+    verify(mDisp, times(9)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(2))));
+    verify(mDisp, times(19)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(3))));
+    verify(mDisp, never()).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(4))));
+  }
+
+  @Test
+  public void testPriorityPreemptionForHierarchicalOfQueues()
+      throws IOException {
+    /**
+     * When queues form a multi-level hierarchy with different priorities:
+     *
+     * <pre>
+     * root
+     * - a (capacity=30), p=1
+     *   - a1 (capacity=40), p=1
+     *   - a2 (capacity=60), p=1
+     * - b (capacity=30), p=1
+     *   - b1 (capacity=50), p=1
+     *   - b2 (capacity=50), p=2
+     * - c (capacity=40), p=1
+     * </pre>
+     */
+    String labelsConfig = "=100,true"; // default partition
+    String nodesConfig = "n1="; // only one node
+    String queuesConfig =
+        // guaranteed,max,used,pending
+        "root(=[100 100 100 100]);" + //root
+            "-a(=[30 100 40 50]){priority=1};" + // a
+            "--a1(=[12 100 20 50]){priority=1};" + // a1
+            "--a2(=[18 100 20 50]){priority=1};" + // a2
+            "-b(=[30 100 59 50]){priority=1};" + // b
+            "--b1(=[15 100 30 50]){priority=1};" + // b1
+            "--b2(=[15 100 29 50]){priority=2};" + // b2
+            "-c(=[40 100 1 30]){priority=1}";   // c
+    String appsConfig =
+        //queueName\t(priority,resource,host,expression,#repeat,reserved)
+        "a1\t(1,1,n1,,20,false);" + // app1 in a1
+            "a2\t(1,1,n1,,20,false);" + // app2 in a2
+            "b1\t(1,1,n1,,30,false);" + // app3 in b1
+            "b2\t(1,1,n1,,29,false);" + // app4 in b2
+            "c\t(1,1,n1,,29,false)"; // app5 in c
+
+
+    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
+    policy.editSchedule();
+
+    // Preemption should first divide capacities between a / b, and b2 should
+    // get less preemption than b1 (because b2 has higher priority)
+    verify(mDisp, times(5)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(1))));
+    verify(mDisp, never()).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(2))));
+    verify(mDisp, times(15)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(3))));
+    verify(mDisp, times(9)).handle(argThat(
+        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
+            getAppAttemptId(4))));
+  }
+
+}

+ 21 - 10
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicy.java

@@ -36,6 +36,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingMonitor;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.policy.QueueOrderingPolicy;
 import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
@@ -220,7 +221,9 @@ public class TestProportionalCapacityPreemptionPolicy {
     };
     ProportionalCapacityPreemptionPolicy policy = buildPolicy(qData);
     policy.editSchedule();
-    verify(mDisp, times(16)).handle(argThat(new IsPreemptionRequestFor(appA)));
+
+    // A will preempt up to (guaranteed - allocated).
+    verify(mDisp, times(10)).handle(argThat(new IsPreemptionRequestFor(appA)));
   }
   
   @Test
@@ -588,8 +591,8 @@ public class TestProportionalCapacityPreemptionPolicy {
     };
     ProportionalCapacityPreemptionPolicy policy = buildPolicy(qData);
     policy.editSchedule();
-    // correct imbalance between over-capacity queues
-    verify(mDisp, times(5)).handle(argThat(new IsPreemptionRequestFor(appA)));
+    // Will not preempt for over capacity queues
+    verify(mDisp, never()).handle(argThat(new IsPreemptionRequestFor(appA)));
   }
 
   @Test
@@ -702,7 +705,7 @@ public class TestProportionalCapacityPreemptionPolicy {
   public void testZeroGuarOverCap() {
     int[][] qData = new int[][] {
       //  /    A   B   C    D   E   F
-         { 200, 100, 0, 99, 0, 100, 100 },  // abs
+         { 200, 100, 0, 100, 0, 100, 100 },  // abs
         { 200, 200, 200, 200, 200, 200, 200 },  // maxCap
         { 170,  170, 60, 20, 90, 0,  0 },  // used
         {  85,   50,  30,  10,  10,  20, 20 },  // pending
@@ -713,14 +716,14 @@ public class TestProportionalCapacityPreemptionPolicy {
     };
     ProportionalCapacityPreemptionPolicy policy = buildPolicy(qData);
     policy.editSchedule();
-    // we verify both that C has priority on B and D (has it has >0 guarantees)
-    // and that B and D are force to share their over capacity fairly (as they
-    // are both zero-guarantees) hence D sees some of its containers preempted
-    verify(mDisp, times(15)).handle(argThat(new IsPreemptionRequestFor(appC)));
+    // No preemption should happen because zero-guaranteed queues should be
+    // treated as always satisfied, so they should not preempt from each other.
+    verify(mDisp, never()).handle(argThat(new IsPreemptionRequestFor(appA)));
+    verify(mDisp, never()).handle(argThat(new IsPreemptionRequestFor(appB)));
+    verify(mDisp, never()).handle(argThat(new IsPreemptionRequestFor(appC)));
+    verify(mDisp, never()).handle(argThat(new IsPreemptionRequestFor(appD)));
   }
   
-  
-  
   @Test
   public void testHierarchicalLarge() {
     int[][] qData = new int[][] {
@@ -1232,6 +1235,13 @@ public class TestProportionalCapacityPreemptionPolicy {
     when(pq.getChildQueues()).thenReturn(cqs);
     ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
     when(pq.getReadLock()).thenReturn(lock.readLock());
+
+    // Ordering policy
+    QueueOrderingPolicy policy = mock(QueueOrderingPolicy.class);
+    when(policy.getConfigName()).thenReturn(
+        CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY);
+    when(pq.getQueueOrderingPolicy()).thenReturn(policy);
+    when(pq.getPriority()).thenReturn(Priority.newInstance(0));
     for (int i = 0; i < subqueues; ++i) {
       pqs.add(pq);
     }
@@ -1302,6 +1312,7 @@ public class TestProportionalCapacityPreemptionPolicy {
     }
     ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
     when(lq.getReadLock()).thenReturn(lock.readLock());
+    when(lq.getPriority()).thenReturn(Priority.newInstance(0));
     p.getChildQueues().add(lq);
     return lq;
   }

+ 5 - 6
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyForNodePartitions.java

@@ -95,7 +95,7 @@ public class TestProportionalCapacityPreemptionPolicyForNodePartitions
   }
 
   @Test
-  public void testNodePartitionPreemptionRespectMaximumCapacity()
+  public void testNodePartitionPreemptionNotHappenBetweenSatisfiedQueues()
       throws IOException {
     /**
      * Queue structure is:
@@ -114,8 +114,8 @@ public class TestProportionalCapacityPreemptionPolicyForNodePartitions
      * 2 apps in cluster.
      * app1 in b and app2 in c.
      *
-     * app1 uses 90x, and app2 use 10x. After preemption, app2 will preempt 10x
-     * from app1 because of max capacity.
+     * app1 uses 90x, and app2 uses 10x. We don't expect preemption to happen
+     * between them because both of them are satisfied.
      */
     String labelsConfig =
         "=100,true;" + // default partition
@@ -139,9 +139,8 @@ public class TestProportionalCapacityPreemptionPolicyForNodePartitions
     buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
     policy.editSchedule();
 
-    // 30 preempted from app1, 30 preempted from app4, and nothing preempted
-    // from app2/app3
-    verify(mDisp, times(20)).handle(
+    // No preemption happens
+    verify(mDisp, never()).handle(
         argThat(new IsPreemptionRequestFor(getAppAttemptId(1))));
     verify(mDisp, never()).handle(
         argThat(new IsPreemptionRequestFor(getAppAttemptId(2))));

+ 10 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyMockFramework.java

@@ -46,8 +46,8 @@ public class TestProportionalCapacityPreemptionPolicyMockFramework
         "root(=[200 200 100 100],red=[100 100 100 100],blue=[200 200 200 200]);" + //root
             "-a(=[100 200 100 100],red=[0 0 0 0],blue=[200 200 200 200]);" + // a
             "--a1(=[50 100 50 100],red=[0 0 0 0],blue=[100 200 200 0]);" + // a1
-            "--a2(=[50 200 50 0],red=[0 0 0 0],blue=[100 200 0 200]);" + // a2
-            "-b(=[100 200 0 0],red=[100 100 100 100],blue=[0 0 0 0])";
+            "--a2(=[50 200 50 0],red=[0 0 0 0],blue=[100 200 0 200]){priority=2};" + // a2
+            "-b(=[100 200 0 0],red=[100 100 100 100],blue=[0 0 0 0]){priority=1,disable_preemption=true}";
     String appsConfig=
         //queueName\t(priority,resource,host,expression,#repeat,reserved)
         // app1 in a1, , 50 in n2 (reserved), 50 in n2 (allocated)
@@ -75,6 +75,7 @@ public class TestProportionalCapacityPreemptionPolicyMockFramework
     checkPendingResource(cs.getQueue("root"), "red", 100);
     checkAbsCapacities(cs.getQueue("root"), "blue", 1f, 1f, 1f);
     checkPendingResource(cs.getQueue("root"), "blue", 200);
+    checkPriority(cs.getQueue("root"), 0); // default
 
     // a
     checkAbsCapacities(cs.getQueue("a"), "", 0.5f, 1f, 0.5f);
@@ -83,6 +84,7 @@ public class TestProportionalCapacityPreemptionPolicyMockFramework
     checkPendingResource(cs.getQueue("a"), "red", 0);
     checkAbsCapacities(cs.getQueue("a"), "blue", 1f, 1f, 1f);
     checkPendingResource(cs.getQueue("a"), "blue", 200);
+    checkPriority(cs.getQueue("a"), 0); // default
 
     // a1
     checkAbsCapacities(cs.getQueue("a1"), "", 0.25f, 0.5f, 0.25f);
@@ -91,6 +93,7 @@ public class TestProportionalCapacityPreemptionPolicyMockFramework
     checkPendingResource(cs.getQueue("a1"), "red", 0);
     checkAbsCapacities(cs.getQueue("a1"), "blue", 0.5f, 1f, 1f);
     checkPendingResource(cs.getQueue("a1"), "blue", 0);
+    checkPriority(cs.getQueue("a1"), 0); // default
 
     // a2
     checkAbsCapacities(cs.getQueue("a2"), "", 0.25f, 1f, 0.25f);
@@ -99,14 +102,18 @@ public class TestProportionalCapacityPreemptionPolicyMockFramework
     checkPendingResource(cs.getQueue("a2"), "red", 0);
     checkAbsCapacities(cs.getQueue("a2"), "blue", 0.5f, 1f, 0f);
     checkPendingResource(cs.getQueue("a2"), "blue", 200);
+    checkPriority(cs.getQueue("a2"), 2);
+    Assert.assertFalse(cs.getQueue("a2").getPreemptionDisabled());
 
-    // b1
+    // b
     checkAbsCapacities(cs.getQueue("b"), "", 0.5f, 1f, 0f);
     checkPendingResource(cs.getQueue("b"), "", 0);
     checkAbsCapacities(cs.getQueue("b"), "red", 1f, 1f, 1f);
     checkPendingResource(cs.getQueue("b"), "red", 100);
     checkAbsCapacities(cs.getQueue("b"), "blue", 0f, 0f, 0f);
     checkPendingResource(cs.getQueue("b"), "blue", 0);
+    checkPriority(cs.getQueue("b"), 1);
+    Assert.assertTrue(cs.getQueue("b").getPreemptionDisabled());
 
     // Check ignored partitioned containers in queue
     Assert.assertEquals(100, ((LeafQueue) cs.getQueue("a1"))
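
The queuesConfig string above extends the mock framework's per-queue syntax with an optional {key=value,...} suffix (priority, disable_preemption). As an illustration only (this is not the framework's actual parser), such a suffix can be split into properties like so:

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class QueueSuffixParserSketch {
      // Parse a suffix such as "{priority=1,disable_preemption=true}".
      static Map<String, String> parse(String suffix) {
        Map<String, String> props = new LinkedHashMap<>();
        String body = suffix.substring(1, suffix.length() - 1); // strip braces
        for (String pair : body.split(",")) {
          String[] kv = pair.split("=", 2);
          props.put(kv[0].trim(), kv[1].trim());
        }
        return props;
      }

      public static void main(String[] args) {
        // prints {priority=1, disable_preemption=true}
        System.out.println(parse("{priority=1,disable_preemption=true}"));
      }
    }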

+ 19 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerPreemptionTestBase.java

@@ -46,7 +46,7 @@ public class CapacitySchedulerPreemptionTestBase {
 
   final int GB = 1024;
 
-  Configuration conf;
+  CapacitySchedulerConfiguration conf;
 
   RMNodeLabelsManager mgr;
 
@@ -54,13 +54,15 @@ public class CapacitySchedulerPreemptionTestBase {
 
   @Before
   void setUp() throws Exception {
-    conf = new YarnConfiguration();
+    conf = new CapacitySchedulerConfiguration();
     conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
         ResourceScheduler.class);
     conf.setBoolean(YarnConfiguration.RM_SCHEDULER_ENABLE_MONITORS, true);
     conf.setClass(YarnConfiguration.RM_SCHEDULER_MONITOR_POLICIES,
         ProportionalCapacityPreemptionPolicy.class, SchedulingEditPolicy.class);
-    conf = TestUtils.getConfigurationWithMultipleQueues(this.conf);
+    conf = (CapacitySchedulerConfiguration) TestUtils
+        .getConfigurationWithMultipleQueues(this.conf);
+    conf.setInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB, 100 * GB);
 
     // Set preemption related configurations
     conf.setInt(CapacitySchedulerConfiguration.PREEMPTION_WAIT_TIME_BEFORE_KILL,
@@ -146,4 +148,18 @@ public class CapacitySchedulerPreemptionTestBase {
 
     Assert.fail();
   }
+
+  public void checkNumberOfPreemptionCandidateFromApp(
+      ProportionalCapacityPreemptionPolicy policy, int expected,
+      ApplicationAttemptId attemptId) {
+    int total = 0;
+
+    for (RMContainer rmContainer : policy.getToPreemptContainers().keySet()) {
+      if (rmContainer.getApplicationAttemptId().equals(attemptId)) {
+        ++total;
+      }
+    }
+
+    Assert.assertEquals(expected, total);
+  }
 }
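
The new checkNumberOfPreemptionCandidateFromApp helper counts how many of the policy's currently selected containers belong to a given application attempt. It is exercised by the surgical-preemption tests later in this commit, e.g.:

    editPolicy.editSchedule();
    // expect 6 of app1's containers among the preemption candidates
    checkNumberOfPreemptionCandidateFromApp(editPolicy, 6,
        am1.getApplicationAttemptId());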

+ 0 - 9
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java

@@ -110,9 +110,6 @@ public class TestApplicationLimits {
         thenReturn(Resources.createResource(16*GB, 32));
     when(csContext.getClusterResource()).
         thenReturn(Resources.createResource(10 * 16 * GB, 10 * 32));
-    when(csContext.getNonPartitionedQueueComparator()).
-        thenReturn(
-            CapacitySchedulerQueueManager.NON_PARTITIONED_QUEUE_COMPARATOR);
     when(csContext.getResourceCalculator()).
         thenReturn(resourceCalculator);
     when(csContext.getRMContext()).thenReturn(rmContext);
@@ -276,9 +273,6 @@ public class TestApplicationLimits {
         thenReturn(Resources.createResource(GB, 1));
     when(csContext.getMaximumResourceCapability()).
         thenReturn(Resources.createResource(16*GB, 16));
-    when(csContext.getNonPartitionedQueueComparator()).
-        thenReturn(
-            CapacitySchedulerQueueManager.NON_PARTITIONED_QUEUE_COMPARATOR);
     when(csContext.getResourceCalculator()).thenReturn(resourceCalculator);
     when(csContext.getRMContext()).thenReturn(rmContext);
     when(csContext.getPreemptionManager()).thenReturn(new PreemptionManager());
@@ -581,9 +575,6 @@ public class TestApplicationLimits {
         thenReturn(Resources.createResource(GB));
     when(csContext.getMaximumResourceCapability()).
         thenReturn(Resources.createResource(16*GB));
-    when(csContext.getNonPartitionedQueueComparator()).
-        thenReturn(
-            CapacitySchedulerQueueManager.NON_PARTITIONED_QUEUE_COMPARATOR);
     when(csContext.getResourceCalculator()).thenReturn(resourceCalculator);
     when(csContext.getRMContext()).thenReturn(rmContext);
     

+ 0 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimitsByPartition.java

@@ -594,9 +594,6 @@ public class TestApplicationLimitsByPartition {
         .thenReturn(Resources.createResource(GB));
     when(csContext.getMaximumResourceCapability())
         .thenReturn(Resources.createResource(16 * GB));
-    when(csContext.getNonPartitionedQueueComparator())
-        .thenReturn(
-            CapacitySchedulerQueueManager.NON_PARTITIONED_QUEUE_COMPARATOR);
     when(csContext.getResourceCalculator()).thenReturn(resourceCalculator);
     RMContext rmContext = TestUtils.getMockRMContext();
     RMContext spyRMContext = spy(rmContext);

+ 570 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerSurgicalPreemption.java

@@ -22,22 +22,26 @@ import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.Priority;
 import org.apache.hadoop.yarn.api.records.ResourceRequest;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
 import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
 import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
 import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy;
 import org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
 import org.apache.hadoop.yarn.util.resource.Resources;
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Set;
 
 public class TestCapacitySchedulerSurgicalPreemption
     extends CapacitySchedulerPreemptionTestBase {
@@ -167,8 +171,7 @@ public class TestCapacitySchedulerSurgicalPreemption
      *
      * 1) Two nodes (n1/n2) in the cluster, each of them has 20G.
      *
-     * 2) app1 submit to queue-a first, it asked 38 * 1G containers
-     * We will allocate 20 on n1 and 19 on n2.
+     * 2) app1 submits to queue-b, asking for 5 * 1G containers
      *
      * 3) app2 submit to queue-c, ask for one 4G container (for AM)
      *
@@ -243,4 +246,569 @@ public class TestCapacitySchedulerSurgicalPreemption
 
     rm1.close();
   }
+
+  @Test(timeout = 60000)
+  public void testPriorityPreemptionWhenAllQueuesAreBelowGuaranteedCapacities()
+      throws Exception {
+    /**
+     * Test case: Submit two applications (app1/app2) to different queues, queue
+     * structure:
+     *
+     * <pre>
+     *             Root
+     *            /  |  \
+     *           a   b   c
+     *          10   20  70
+     * </pre>
+     *
+     * 1) Two nodes (n1/n2) in the cluster, each of them has 20G.
+     *
+     * 2) app1 submits to queue-b first, asking for 6 * 1G containers.
+     * We will allocate 4 on n1 (including the AM) and 3 on n2.
+     *
+     * 3) app2 submits to queue-c, asking for one 18G container (for its AM)
+     *
+     * After preemption, we should expect:
+     * Preempt 2 containers from app1, and the AM of app2 successfully allocated.
+     */
+    conf.setPUOrderingPolicyUnderUtilizedPreemptionEnabled(true);
+    conf.setPUOrderingPolicyUnderUtilizedPreemptionDelay(1000);
+    conf.setQueueOrderingPolicy(CapacitySchedulerConfiguration.ROOT,
+        CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY);
+
+    // Queue c has higher priority than a/b
+    conf.setQueuePriority(CapacitySchedulerConfiguration.ROOT + ".c", 1);
+
+    MockRM rm1 = new MockRM(conf);
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+
+    MockNM nm1 = rm1.registerNode("h1:1234", 20 * GB);
+    MockNM nm2 = rm1.registerNode("h2:1234", 20 * GB);
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+    RMNode rmNode2 = rm1.getRMContext().getRMNodes().get(nm2.getNodeId());
+
+    // launch an app to queue-b, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "b");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    am1.allocate("*", 1 * GB, 6, new ArrayList<>());
+
+    // Do allocation for node1/node2
+    for (int i = 0; i < 3; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+    }
+
+    // App1 should have 7 containers now, so the abs-used-cap of b is
+    // 7 / 40 = 17.5% < 20% (guaranteed)
+    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
+        am1.getApplicationAttemptId());
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+    // 4 from n1 and 3 from n2
+    waitNumberOfLiveContainersOnNodeFromApp(cs.getNode(rmNode1.getNodeID()),
+        am1.getApplicationAttemptId(), 4);
+    waitNumberOfLiveContainersOnNodeFromApp(cs.getNode(rmNode2.getNodeID()),
+        am1.getApplicationAttemptId(), 3);
+
+    // Submit app2 to queue-c, asking for an 18G container for its AM
+    RMApp app2 = rm1.submitApp(18 * GB, "app", "user", null, "c");
+    FiCaSchedulerApp schedulerApp2 = cs.getApplicationAttempt(
+        ApplicationAttemptId.newInstance(app2.getApplicationId(), 1));
+
+    while (cs.getNode(rmNode1.getNodeID()).getReservedContainer() == null) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+      Thread.sleep(10);
+    }
+
+    // Call editSchedule immediately: containers are not selected
+    ProportionalCapacityPreemptionPolicy editPolicy =
+        (ProportionalCapacityPreemptionPolicy) getSchedulingEditPolicy(rm1);
+    editPolicy.editSchedule();
+    Assert.assertEquals(0, editPolicy.getToPreemptContainers().size());
+
+    // Sleep the timeout interval, we should be able to see containers selected
+    Thread.sleep(1000);
+    editPolicy.editSchedule();
+    Assert.assertEquals(2, editPolicy.getToPreemptContainers().size());
+
+    // Call editSchedule again: selected containers are killed, and new AM
+    // container launched
+    editPolicy.editSchedule();
+
+    // Do allocation till reserved container allocated
+    while (cs.getNode(rmNode1.getNodeID()).getReservedContainer() != null) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+      Thread.sleep(10);
+    }
+
+    waitNumberOfLiveContainersFromApp(schedulerApp2, 1);
+
+    rm1.close();
+  }
+
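
Why exactly 2 containers are selected above: app2's 18G AM is reserved on n1, which holds 4 of app1's 1G containers. A back-of-envelope check using only values from this test (a sketch, not part of the test class):

    public class SelectionCountSketch {
      public static void main(String[] args) {
        int gb = 1024;
        int n1Capacity = 20 * gb;   // each node has 20G
        int usedOnN1 = 4 * gb;      // app1: AM + 3 x 1G containers on n1
        int demand = 18 * gb;       // app2's AM container
        int mustFree = demand - (n1Capacity - usedOnN1);
        System.out.println(mustFree / gb); // 2 -> two 1G containers to preempt
      }
    }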
+  @Test(timeout = 300000)
+  public void testPriorityPreemptionRequiresMoveReservation()
+      throws Exception {
+    /**
+     * Test case: Submit two applications (app1/app2) to different queues, queue
+     * structure:
+     *
+     * <pre>
+     *             Root
+     *            /  |  \
+     *           a   b   c
+     *          10   20  70
+     * </pre>
+     *
+     * 1) 3 nodes in the cluster, 10G for each
+     *
+     * 2) app1 submits to queue-b first, asking for 2G containers;
+     *    it gets a 2G AM on n1 and 2 * 2G containers on n2
+     *
+     * 3) app2 submits to queue-c with a 2G AM container (allocated on n3);
+     *    app2 then asks for a 9G container, which will be reserved on n3
+     *
+     * We should expect the container to be unreserved from n3, re-reserved
+     * on n2, and allocated there after preemption
+     */
+    conf.setPUOrderingPolicyUnderUtilizedPreemptionEnabled(true);
+    conf.setPUOrderingPolicyUnderUtilizedPreemptionDelay(1000);
+    conf.setQueueOrderingPolicy(CapacitySchedulerConfiguration.ROOT,
+        CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY);
+    conf.setPUOrderingPolicyUnderUtilizedPreemptionMoveReservation(true);
+
+    // Queue c has higher priority than a/b
+    conf.setQueuePriority(CapacitySchedulerConfiguration.ROOT + ".c", 1);
+
+    MockRM rm1 = new MockRM(conf);
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+
+    MockNM nm1 = rm1.registerNode("h1:1234", 10 * GB);
+    MockNM nm2 = rm1.registerNode("h2:1234", 10 * GB);
+    MockNM nm3 = rm1.registerNode("h3:1234", 10 * GB);
+
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+    RMNode rmNode2 = rm1.getRMContext().getRMNodes().get(nm2.getNodeId());
+    RMNode rmNode3 = rm1.getRMContext().getRMNodes().get(nm3.getNodeId());
+
+    // launch an app to queue-b, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(2 * GB, "app", "user", null, "b");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    am1.allocate("*", 2 * GB, 2, new ArrayList<>());
+
+    // Do allocation for node2 (two 2G containers for app1)
+    for (int i = 0; i < 2; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+    }
+
+    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
+        am1.getApplicationAttemptId());
+    Assert.assertEquals(3, schedulerApp1.getLiveContainers().size());
+
+    // 1 from n1 and 2 from n2
+    waitNumberOfLiveContainersOnNodeFromApp(cs.getNode(rmNode1.getNodeID()),
+        am1.getApplicationAttemptId(), 1);
+    waitNumberOfLiveContainersOnNodeFromApp(cs.getNode(rmNode2.getNodeID()),
+        am1.getApplicationAttemptId(), 2);
+
+    // Submit app2 to queue-c, asking for a 2G AM container (on n3)
+    RMApp app2 = rm1.submitApp(2 * GB, "app", "user", null, "c");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm3);
+    FiCaSchedulerApp schedulerApp2 = cs.getApplicationAttempt(
+        ApplicationAttemptId.newInstance(app2.getApplicationId(), 1));
+
+    // Asks 1 * 9G container
+    am2.allocate("*", 9 * GB, 1, new ArrayList<>());
+
+    // Do allocation for node3 once
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode3));
+
+    // Make sure container reserved on node3
+    Assert.assertNotNull(
+        cs.getNode(rmNode3.getNodeID()).getReservedContainer());
+
+    // Call editSchedule immediately: nothing happens
+    ProportionalCapacityPreemptionPolicy editPolicy =
+        (ProportionalCapacityPreemptionPolicy) getSchedulingEditPolicy(rm1);
+    editPolicy.editSchedule();
+    Assert.assertNotNull(
+        cs.getNode(rmNode3.getNodeID()).getReservedContainer());
+
+    // Sleep the timeout interval, we should be able to see reserved container
+    // moved to n2 (n1 occupied by AM)
+    Thread.sleep(1000);
+    editPolicy.editSchedule();
+    Assert.assertNull(
+        cs.getNode(rmNode3.getNodeID()).getReservedContainer());
+    Assert.assertNotNull(
+        cs.getNode(rmNode2.getNodeID()).getReservedContainer());
+    Assert.assertEquals(am2.getApplicationAttemptId(), cs.getNode(
+        rmNode2.getNodeID()).getReservedContainer().getApplicationAttemptId());
+
+    // Do it again, we should see containers marked to be preempted
+    editPolicy.editSchedule();
+    Assert.assertEquals(2, editPolicy.getToPreemptContainers().size());
+
+    // Call editSchedule again: selected containers are killed
+    editPolicy.editSchedule();
+
+    // Do allocation till reserved container allocated
+    while (schedulerApp2.getLiveContainers().size() < 2) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+      Thread.sleep(200);
+    }
+
+    waitNumberOfLiveContainersFromApp(schedulerApp1, 1);
+
+    rm1.close();
+  }
+
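
Why the reservation moves to n2 rather than n1: assuming the selector avoids killing AM containers (which matches this test's expectations), n1 can never free enough room, while n2 can. A sketch of the arithmetic:

    public class MoveReservationSketch {
      public static void main(String[] args) {
        int gb = 1024, node = 10 * gb, demand = 9 * gb;
        // n1: only app1's 2G AM is running there, and AMs are not preempted
        int reclaimableOnN1 = node - 2 * gb;              // 8G < 9G
        // n2: 6G free plus two preemptable 2G task containers
        int reclaimableOnN2 = (node - 4 * gb) + 4 * gb;   // 10G >= 9G
        System.out.println(reclaimableOnN1 >= demand);    // false
        System.out.println(reclaimableOnN2 >= demand);    // true
      }
    }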
+  @Test(timeout = 60000)
+  public void testPriorityPreemptionOnlyTriggeredWhenDemandingQueueUnsatisfied()
+      throws Exception {
+    /**
+     * Test case: Submit two applications (app1/app2) to different queues, queue
+     * structure:
+     *
+     * <pre>
+     *             Root
+     *            /  |  \
+     *           a   b   c
+     *          10   20  70
+     * </pre>
+     *
+     * 1) 10 nodes (n0-n9) in the cluster, each of them has 10G.
+     *
+     * 2) app1 submits to queue-b first, asking for 8 * 1G containers.
+     * We will allocate 1 container on each of n0-n8 (AM on n0)
+     *
+     * 3) app2 submits to queue-c, asking for 10 * 10G containers (including AM)
+     *
+     * After preemption, we should expect:
+     * 6 containers preempted from app1, bringing usage of queue-c to 70%
+     */
+    conf.setPUOrderingPolicyUnderUtilizedPreemptionEnabled(true);
+    conf.setPUOrderingPolicyUnderUtilizedPreemptionDelay(1000);
+    conf.setQueueOrderingPolicy(CapacitySchedulerConfiguration.ROOT,
+        CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY);
+
+    // Queue c has higher priority than a/b
+    conf.setQueuePriority(CapacitySchedulerConfiguration.ROOT + ".c", 1);
+
+    MockRM rm1 = new MockRM(conf);
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+
+    MockNM[] mockNMs = new MockNM[10];
+    for (int i = 0; i < 10; i++) {
+      mockNMs[i] = rm1.registerNode("h" + i + ":1234", 10 * GB);
+    }
+
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+
+    RMNode[] rmNodes = new RMNode[10];
+    for (int i = 0; i < 10; i++) {
+      rmNodes[i] = rm1.getRMContext().getRMNodes().get(mockNMs[i].getNodeId());
+    }
+
+    // launch an app to queue-b, AM container should be launched on n0
+    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "b");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, mockNMs[0]);
+
+    am1.allocate("*", 1 * GB, 8, new ArrayList<>());
+
+    // Do allocation for nm1-nm8
+    for (int i = 1; i < 9; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNodes[i]));
+    }
+
+    // App1 should have 9 containers now, so the abs-used-cap of b is 9%
+    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
+        am1.getApplicationAttemptId());
+    Assert.assertEquals(9, schedulerApp1.getLiveContainers().size());
+    for (int i = 0; i < 9; i++) {
+      waitNumberOfLiveContainersOnNodeFromApp(cs.getNode(rmNodes[i].getNodeID()),
+          am1.getApplicationAttemptId(), 1);
+    }
+
+    // Submit app2 to queue-c, asking for a 10G AM container
+    // Launch AM in NM9
+    RMApp app2 = rm1.submitApp(10 * GB, "app", "user", null, "c");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, mockNMs[9]);
+    FiCaSchedulerApp schedulerApp2 = cs.getApplicationAttempt(
+        ApplicationAttemptId.newInstance(app2.getApplicationId(), 1));
+
+    // Ask 10 * 10GB containers
+    am2.allocate("*", 10 * GB, 10, new ArrayList<>());
+
+    // Do allocation for all nms
+    for (int i = 1; i < 10; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNodes[i]));
+    }
+
+    // Check am2 reserved resource from nm1-nm9
+    for (int i = 1; i < 9; i++) {
+      Assert.assertNotNull("Should reserve on nm-" + i,
+          cs.getNode(rmNodes[i].getNodeID()).getReservedContainer());
+    }
+
+    // Sleep the timeout interval, we should be able to see 6 containers
+    // selected: 6 (to preempt) + 1 (allocated AM) brings queue-c to 70%
+    Thread.sleep(1000);
+
+    ProportionalCapacityPreemptionPolicy editPolicy =
+        (ProportionalCapacityPreemptionPolicy) getSchedulingEditPolicy(rm1);
+    editPolicy.editSchedule();
+    checkNumberOfPreemptionCandidateFromApp(editPolicy, 6,
+        am1.getApplicationAttemptId());
+
+    // Call editSchedule again: selected containers are killed
+    editPolicy.editSchedule();
+    waitNumberOfLiveContainersFromApp(schedulerApp1, 3);
+
+    // Do allocation for all nms
+    for (int i = 1; i < 10; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNodes[i]));
+    }
+
+    waitNumberOfLiveContainersFromApp(schedulerApp2, 7);
+    waitNumberOfLiveContainersFromApp(schedulerApp1, 3);
+
+    rm1.close();
+  }
+
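
The 6-container expectation above is just queue-c's guaranteed share expressed in 10G containers. A sketch of the accounting, using this test's numbers only:

    public class TargetShareSketch {
      public static void main(String[] args) {
        int clusterGB = 10 * 10;                  // 10 nodes x 10G
        int targetGB = clusterGB * 70 / 100;      // queue-c guarantees 70G
        int wantedContainers = targetGB / 10;     // 7 x 10G containers
        int alreadyAllocated = 1;                 // app2's AM on n9
        System.out.println(wantedContainers - alreadyAllocated); // 6 to free
      }
    }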
+  @Test(timeout = 600000)
+  public void testPriorityPreemptionFromHighestPriorityQueueAndOldestContainer()
+      throws Exception {
+    /**
+     * Test case: Submit three applications (app1/app2/app3) to different queues, queue
+     * structure:
+     *
+     * <pre>
+     *             Root
+     *            /  |  \
+     *           a   b   c
+     *          45  45  10
+     * </pre>
+     *
+     * Priority of queue_a = 1
+     * Priority of queue_b = 2
+     *
+     * 1) 5 nodes (n0-n4) in the cluster, each of them has 4G.
+     *
+     * 2) app1 submits to queue-c first (AM=1G), asking for 4 * 1G containers.
+     *    We will allocate 1 container on each of n0-n4 (AM on n4)
+     *
+     * 3) app2 submits to queue-a, AM container=0.5G, allocated on n0.
+     *    It asks for 2 * 3.5G containers. (Reserved on n0/n1)
+     *
+     * 4) app3 submits to queue-b, AM container=0.5G, allocated on n2.
+     *    It asks for 2 * 3.5G containers. (Reserved on n2/n3)
+     *
+     * First we will preempt the container on n2, since it is the oldest
+     * container of the highest-priority queue (b)
+     */
+
+    // Total preemption = 1G per round, which is 5% of cluster resource (20G)
+    conf.setFloat(CapacitySchedulerConfiguration.TOTAL_PREEMPTION_PER_ROUND,
+        0.05f);
+    conf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 512);
+    conf.setPUOrderingPolicyUnderUtilizedPreemptionEnabled(true);
+    conf.setPUOrderingPolicyUnderUtilizedPreemptionDelay(1000);
+    conf.setQueueOrderingPolicy(CapacitySchedulerConfiguration.ROOT,
+        CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY);
+
+    // a/b have higher priority than c
+    conf.setQueuePriority(CapacitySchedulerConfiguration.ROOT + ".a", 1);
+    conf.setQueuePriority(CapacitySchedulerConfiguration.ROOT + ".b", 2);
+    conf.setCapacity(CapacitySchedulerConfiguration.ROOT + ".a", 45f);
+    conf.setCapacity(CapacitySchedulerConfiguration.ROOT + ".b", 45f);
+    conf.setCapacity(CapacitySchedulerConfiguration.ROOT + ".c", 10f);
+
+    MockRM rm1 = new MockRM(conf);
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+
+    MockNM[] mockNMs = new MockNM[5];
+    for (int i = 0; i < 5; i++) {
+      mockNMs[i] = rm1.registerNode("h" + i + ":1234", 4 * GB);
+    }
+
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+
+    RMNode[] rmNodes = new RMNode[5];
+    for (int i = 0; i < 5; i++) {
+      rmNodes[i] = rm1.getRMContext().getRMNodes().get(mockNMs[i].getNodeId());
+    }
+
+    // launch an app to queue-c, AM container should be launched on n4
+    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "c");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, mockNMs[4]);
+
+    am1.allocate("*", 1 * GB, 4, new ArrayList<>());
+
+    // Do allocation for n0-n3
+    for (int i = 0; i < 4; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNodes[i]));
+    }
+
+    // App1 should have 5 containers now, one for each node
+    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
+        am1.getApplicationAttemptId());
+    Assert.assertEquals(5, schedulerApp1.getLiveContainers().size());
+    for (int i = 0; i < 5; i++) {
+      waitNumberOfLiveContainersOnNodeFromApp(cs.getNode(rmNodes[i].getNodeID()),
+          am1.getApplicationAttemptId(), 1);
+    }
+
+    // Submit app2 to queue-a, asking for a 0.5G AM container (on n0)
+    RMApp app2 = rm1.submitApp(512, "app", "user", null, "a");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, mockNMs[0]);
+    FiCaSchedulerApp schedulerApp2 = cs.getApplicationAttempt(
+        ApplicationAttemptId.newInstance(app2.getApplicationId(), 1));
+
+    // Ask 2 * 3.5GB containers
+    am2.allocate("*", 3 * GB + 512, 2, new ArrayList<>());
+
+    // Do allocation for n0-n1
+    for (int i = 0; i < 2; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNodes[i]));
+    }
+
+    // Check am2 reserved resource from nm0-nm1
+    for (int i = 0; i < 2; i++) {
+      Assert.assertNotNull("Should reserve on nm-" + i,
+          cs.getNode(rmNodes[i].getNodeID()).getReservedContainer());
+      Assert.assertEquals(cs.getNode(rmNodes[i].getNodeID())
+          .getReservedContainer().getQueueName(), "a");
+    }
+
+    // Submit app3 to queue-b, asking for a 0.5G AM container (on n2)
+    RMApp app3 = rm1.submitApp(512, "app", "user", null, "b");
+    MockAM am3 = MockRM.launchAndRegisterAM(app3, rm1, mockNMs[2]);
+    FiCaSchedulerApp schedulerApp3 = cs.getApplicationAttempt(
+        ApplicationAttemptId.newInstance(app3.getApplicationId(), 1));
+
+    // Ask 2 * 3.5GB containers
+    am3.allocate("*", 3 * GB + 512, 2, new ArrayList<>());
+
+    // Do allocation for n2-n3
+    for (int i = 2; i < 4; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNodes[i]));
+    }
+
+    // Check am3 reserved resource from nm2-nm3
+    for (int i = 2; i < 4; i++) {
+      Assert.assertNotNull("Should reserve on nm-" + i,
+          cs.getNode(rmNodes[i].getNodeID()).getReservedContainer());
+      Assert.assertEquals(cs.getNode(rmNodes[i].getNodeID())
+          .getReservedContainer().getQueueName(), "b");
+    }
+
+    // Sleep the timeout interval, we should be able to see 1 container selected
+    Thread.sleep(1000);
+
+    /* 1st container preempted is on n2 */
+    ProportionalCapacityPreemptionPolicy editPolicy =
+        (ProportionalCapacityPreemptionPolicy) getSchedulingEditPolicy(rm1);
+    editPolicy.editSchedule();
+
+    // We should have one to-preempt container, on node[2]
+    Set<RMContainer> selectedToPreempt =
+        editPolicy.getToPreemptContainers().keySet();
+    Assert.assertEquals(1, selectedToPreempt.size());
+    Assert.assertEquals(mockNMs[2].getNodeId(),
+        selectedToPreempt.iterator().next().getAllocatedNode());
+
+    // Call editSchedule again: selected containers are killed
+    editPolicy.editSchedule();
+
+    // Do allocation for n0-n3
+    for (int i = 0; i < 4; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNodes[i]));
+    }
+    waitNumberOfLiveContainersFromApp(schedulerApp1, 4);
+    waitNumberOfLiveContainersFromApp(schedulerApp2, 1);
+    waitNumberOfLiveContainersFromApp(schedulerApp3, 2);
+
+    /* 2nd container preempted is on n3 */
+    editPolicy.editSchedule();
+
+    // We should have one to-preempt container, on node[3]
+    selectedToPreempt =
+        editPolicy.getToPreemptContainers().keySet();
+    Assert.assertEquals(1, selectedToPreempt.size());
+    Assert.assertEquals(mockNMs[3].getNodeId(),
+        selectedToPreempt.iterator().next().getAllocatedNode());
+
+    // Call editSchedule again: selected containers are killed
+    editPolicy.editSchedule();
+    waitNumberOfLiveContainersFromApp(schedulerApp1, 3);
+
+    // Do allocation for n0-n3
+    for (int i = 0; i < 4; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNodes[i]));
+    }
+
+    waitNumberOfLiveContainersFromApp(schedulerApp1, 3);
+    waitNumberOfLiveContainersFromApp(schedulerApp2, 1);
+    waitNumberOfLiveContainersFromApp(schedulerApp3, 3);
+
+    /* 3rd container preempted is on n0 */
+    editPolicy.editSchedule();
+
+    // We should have one to-preempt container, on node[0]
+    selectedToPreempt =
+        editPolicy.getToPreemptContainers().keySet();
+    Assert.assertEquals(1, selectedToPreempt.size());
+    Assert.assertEquals(mockNMs[0].getNodeId(),
+        selectedToPreempt.iterator().next().getAllocatedNode());
+
+    // Call editSchedule again: selected containers are killed
+    editPolicy.editSchedule();
+    waitNumberOfLiveContainersFromApp(schedulerApp1, 2);
+
+    // Do allocation for n0-n3
+    for (int i = 0; i < 4; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNodes[i]));
+    }
+
+    waitNumberOfLiveContainersFromApp(schedulerApp1, 2);
+    waitNumberOfLiveContainersFromApp(schedulerApp2, 2);
+    waitNumberOfLiveContainersFromApp(schedulerApp3, 3);
+
+    /* 4th container preempted is on n1 */
+    editPolicy.editSchedule();
+
+    // We should have one to-preempt container, on node[1]
+    selectedToPreempt =
+        editPolicy.getToPreemptContainers().keySet();
+    Assert.assertEquals(1, selectedToPreempt.size());
+    Assert.assertEquals(mockNMs[1].getNodeId(),
+        selectedToPreempt.iterator().next().getAllocatedNode());
+
+    // Call editSchedule again: selected containers are killed
+    editPolicy.editSchedule();
+    waitNumberOfLiveContainersFromApp(schedulerApp1, 1);
+
+    // Do allocation for n0-n3
+    for (int i = 0; i < 4; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNodes[i]));
+    }
+
+    waitNumberOfLiveContainersFromApp(schedulerApp1, 1);
+    waitNumberOfLiveContainersFromApp(schedulerApp2, 3);
+    waitNumberOfLiveContainersFromApp(schedulerApp3, 3);
+
+    rm1.close();
+  }
+
 }
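
The last test pins down the selection order: candidates are taken for the highest-priority demanding queue first (b=2 before a=1), and within a queue the oldest running container goes first, which is why the kill sequence is n2, n3, n0, n1. A conceptual comparator inferred from those assertions (not the selector's actual code):

    import java.util.Comparator;

    final class CandidateSketch {
      final int demandingQueuePriority; // priority of the queue whose reservation needs the node
      final long creationTime;          // when the victim container was allocated
      CandidateSketch(int p, long t) { demandingQueuePriority = p; creationTime = t; }
    }

    final class InferredSelectionOrder implements Comparator<CandidateSketch> {
      @Override
      public int compare(CandidateSketch c1, CandidateSketch c2) {
        if (c1.demandingQueuePriority != c2.demandingQueuePriority) {
          // serve the higher-priority demanding queue first
          return c2.demandingQueuePriority - c1.demandingQueuePriority;
        }
        return Long.compare(c1.creationTime, c2.creationTime); // oldest first
      }
    }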

+ 0 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestChildQueueOrder.java

@@ -99,9 +99,6 @@ public class TestChildQueueOrder {
         Resources.createResource(16*GB, 32));
     when(csContext.getClusterResource()).
         thenReturn(Resources.createResource(100 * 16 * GB, 100 * 32));
-    when(csContext.getNonPartitionedQueueComparator()).
-        thenReturn(
-            CapacitySchedulerQueueManager.NON_PARTITIONED_QUEUE_COMPARATOR);
     when(csContext.getResourceCalculator()).
         thenReturn(resourceComparator);
     when(csContext.getRMContext()).thenReturn(rmContext);

+ 111 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestContainerAllocation.java

@@ -776,4 +776,115 @@ public class TestContainerAllocation {
         Resources.createResource(20 * GB), "", true).getMemorySize());
     rm1.close();
   }
+
+
+  @Test(timeout = 60000)
+  public void testQueuePriorityOrdering() throws Exception {
+    CapacitySchedulerConfiguration newConf =
+        (CapacitySchedulerConfiguration) TestUtils
+            .getConfigurationWithMultipleQueues(conf);
+
+    // Set ordering policy
+    newConf.setQueueOrderingPolicy(CapacitySchedulerConfiguration.ROOT,
+        CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY);
+
+    // Set maximum capacity of A to 20
+    newConf.setMaximumCapacity(CapacitySchedulerConfiguration.ROOT + ".a", 20);
+    newConf.setQueuePriority(CapacitySchedulerConfiguration.ROOT + ".c", 1);
+    newConf.setQueuePriority(CapacitySchedulerConfiguration.ROOT + ".b", 2);
+    newConf.setQueuePriority(CapacitySchedulerConfiguration.ROOT + ".a", 3);
+
+    MockRM rm1 = new MockRM(newConf);
+
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+    MockNM nm1 = rm1.registerNode("h1:1234", 100 * GB);
+
+    // launch an app to queue A, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(2 * GB, "app", "user", null, "a");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    // launch an app to queue B, AM container should be launched in nm1
+    RMApp app2 = rm1.submitApp(2 * GB, "app", "user", null, "b");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm1);
+
+    // launch an app to queue C, AM container should be launched in nm1
+    RMApp app3 = rm1.submitApp(2 * GB, "app", "user", null, "c");
+    MockAM am3 = MockRM.launchAndRegisterAM(app3, rm1, nm1);
+
+    // Each application asks for 10 * 5GB containers
+    am1.allocate("*", 5 * GB, 10, null);
+    am2.allocate("*", 5 * GB, 10, null);
+    am3.allocate("*", 5 * GB, 10, null);
+
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+
+    FiCaSchedulerApp schedulerApp1 =
+        cs.getApplicationAttempt(am1.getApplicationAttemptId());
+    FiCaSchedulerApp schedulerApp2 =
+        cs.getApplicationAttempt(am2.getApplicationAttemptId());
+    FiCaSchedulerApp schedulerApp3 =
+        cs.getApplicationAttempt(am3.getApplicationAttemptId());
+
+    // a container will be allocated to am1;
+    // App1 will have 2 live containers (1 allocated plus the AM)
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    Assert.assertEquals(2, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp2.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp3.getLiveContainers().size());
+
+    // a container will be allocated to am1 again;
+    // App1 will have 3 live containers (2 allocated plus the AM)
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    Assert.assertEquals(3, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp2.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp3.getLiveContainers().size());
+
+    // (Now usages of queues: a=12G (satisfied), b=2G, c=2G)
+
+    // a container will be allocated to am2 (since queue-a reached its
+    // guaranteed capacity);
+    // App2 will have 2 live containers (1 allocated plus the AM)
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    Assert.assertEquals(3, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(2, schedulerApp2.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp3.getLiveContainers().size());
+
+    // Do this 3 more times;
+    // containers will be allocated to am2 (queue-a is at its guaranteed
+    // capacity);
+    // App2 will have 5 live containers (4 allocated plus the AM)
+    for (int i = 0; i < 3; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    }
+    Assert.assertEquals(3, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(5, schedulerApp2.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp3.getLiveContainers().size());
+
+    // (Now usages of queues: a=12G (satisfied), b=22G (satisfied), c=2G)
+
+    // Do this 10 times
+    for (int i = 0; i < 10; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    }
+    Assert.assertEquals(3, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(5, schedulerApp2.getLiveContainers().size());
+    Assert.assertEquals(11, schedulerApp3.getLiveContainers().size());
+
+    // (Now usages of queues: a=12G (satisfied), b=22G (satisfied),
+    // c=52G (satisfied and no pending))
+
+    // Do this 20 times; we can only allocate 2 more containers, 1 to a and 1 to b
+    for (int i = 0; i < 20; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    }
+    Assert.assertEquals(4, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(6, schedulerApp2.getLiveContainers().size());
+    Assert.assertEquals(11, schedulerApp3.getLiveContainers().size());
+
+    // (Now usages of queues: a=17G (satisfied), b=27G (satisfied), c=52G)
+
+    rm1.close();
+  }
 }
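
The final assertions follow from simple accounting on the single 100G node; queue-a is additionally capped by its 20% maximum capacity. A sketch with this test's numbers:

    public class FinalRoundSketch {
      public static void main(String[] args) {
        int a = 2 + 3 * 5;   // 17G in queue-a after one more 5G container
        int b = 2 + 5 * 5;   // 27G in queue-b after one more 5G container
        int c = 2 + 10 * 5;  // 52G in queue-c (no pending asks left)
        System.out.println(a + b + c);             // 96 of 100G used
        System.out.println(100 - (a + b + c) < 5); // true: no room for a 5G ask
        // queue-a also cannot grow further: 17G + 5G would exceed its 20G max
      }
    }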

+ 5 - 8
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java

@@ -175,9 +175,6 @@ public class TestLeafQueue {
         thenReturn(Resources.createResource(16*GB, 32));
     when(csContext.getClusterResource()).
         thenReturn(Resources.createResource(100 * 16 * GB, 100 * 32));
-    when(csContext.getNonPartitionedQueueComparator()).
-        thenReturn(
-            CapacitySchedulerQueueManager.NON_PARTITIONED_QUEUE_COMPARATOR);
     when(csContext.getResourceCalculator()).
         thenReturn(resourceCalculator);
     when(csContext.getPreemptionManager()).thenReturn(new PreemptionManager());
@@ -414,7 +411,7 @@ public class TestLeafQueue {
       "testPolicyRoot" + System.currentTimeMillis();
 
     OrderingPolicy<FiCaSchedulerApp> comPol =    
-      testConf.<FiCaSchedulerApp>getOrderingPolicy(tproot);
+      testConf.<FiCaSchedulerApp>getAppOrderingPolicy(tproot);
     
     
   }
@@ -489,16 +486,16 @@ public class TestLeafQueue {
       "testPolicyRoot" + System.currentTimeMillis();
 
     OrderingPolicy<FiCaSchedulerApp> schedOrder =
-      testConf.<FiCaSchedulerApp>getOrderingPolicy(tproot);
+      testConf.<FiCaSchedulerApp>getAppOrderingPolicy(tproot);
 
     //override default to fair
     String policyType = CapacitySchedulerConfiguration.PREFIX + tproot +
       "." + CapacitySchedulerConfiguration.ORDERING_POLICY;
 
     testConf.set(policyType,
-      CapacitySchedulerConfiguration.FAIR_ORDERING_POLICY);
+      CapacitySchedulerConfiguration.FAIR_APP_ORDERING_POLICY);
     schedOrder =
-      testConf.<FiCaSchedulerApp>getOrderingPolicy(tproot);
+      testConf.<FiCaSchedulerApp>getAppOrderingPolicy(tproot);
     FairOrderingPolicy fop = (FairOrderingPolicy<FiCaSchedulerApp>) schedOrder;
     assertFalse(fop.getSizeBasedWeight());
 
@@ -508,7 +505,7 @@ public class TestLeafQueue {
       FairOrderingPolicy.ENABLE_SIZE_BASED_WEIGHT;
     testConf.set(sbwConfig, "true");
     schedOrder =
-      testConf.<FiCaSchedulerApp>getOrderingPolicy(tproot);
+      testConf.<FiCaSchedulerApp>getAppOrderingPolicy(tproot);
     fop = (FairOrderingPolicy<FiCaSchedulerApp>) schedOrder;
     assertTrue(fop.getSizeBasedWeight());
 

+ 0 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestParentQueue.java

@@ -97,9 +97,6 @@ public class TestParentQueue {
         Resources.createResource(16*GB, 32));
     when(csContext.getClusterResource()).
         thenReturn(Resources.createResource(100 * 16 * GB, 100 * 32));
-    when(csContext.getNonPartitionedQueueComparator()).
-        thenReturn(
-            CapacitySchedulerQueueManager.NON_PARTITIONED_QUEUE_COMPARATOR);
     when(csContext.getPreemptionManager()).thenReturn(new PreemptionManager());
     when(csContext.getResourceCalculator()).
         thenReturn(resourceComparator);

+ 0 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestReservations.java

@@ -133,8 +133,6 @@ public class TestReservations {
         Resources.createResource(16 * GB, 12));
     when(csContext.getClusterResource()).thenReturn(
         Resources.createResource(100 * 16 * GB, 100 * 12));
-    when(csContext.getNonPartitionedQueueComparator()).thenReturn(
-        CapacitySchedulerQueueManager.NON_PARTITIONED_QUEUE_COMPARATOR);
     when(csContext.getResourceCalculator()).thenReturn(resourceCalculator);
     when(csContext.getPreemptionManager()).thenReturn(new PreemptionManager());
     when(csContext.getRMContext()).thenReturn(rmContext);

+ 222 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/policy/TestPriorityUtilizationQueueOrderingPolicy.java

@@ -0,0 +1,222 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.policy;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.ImmutableTable;
+import org.apache.hadoop.yarn.api.records.Priority;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.QueueCapacities;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+public class TestPriorityUtilizationQueueOrderingPolicy {
+  private List<CSQueue> mockCSQueues(String[] queueNames, int[] priorities,
+      float[] utilizations, String partition) {
+    // sanity check
+    assert queueNames != null && priorities != null && utilizations != null
+        && queueNames.length > 0 && queueNames.length == priorities.length
+        && priorities.length == utilizations.length;
+
+    List<CSQueue> list = new ArrayList<>();
+    for (int i = 0; i < queueNames.length; i++) {
+      CSQueue q = mock(CSQueue.class);
+      when(q.getQueueName()).thenReturn(queueNames[i]);
+
+      QueueCapacities qc = new QueueCapacities(false);
+      qc.setUsedCapacity(partition, utilizations[i]);
+
+      when(q.getQueueCapacities()).thenReturn(qc);
+      when(q.getPriority()).thenReturn(Priority.newInstance(priorities[i]));
+
+      list.add(q);
+    }
+
+    return list;
+  }
+
+  private void verifyOrder(QueueOrderingPolicy orderingPolicy, String partition,
+      String[] expectedOrder) {
+    Iterator<CSQueue> iter = orderingPolicy.getAssignmentIterator(partition);
+    int i = 0;
+    while (iter.hasNext()) {
+      CSQueue q = iter.next();
+      Assert.assertEquals(expectedOrder[i], q.getQueueName());
+      i++;
+    }
+
+    assert i == expectedOrder.length;
+  }
+
+  @Test
+  public void testUtilizationOrdering() {
+    PriorityUtilizationQueueOrderingPolicy policy =
+        new PriorityUtilizationQueueOrderingPolicy(false);
+
+    // Case 1, one queue
+    policy.setQueues(mockCSQueues(new String[] { "a" }, new int[] { 0 },
+        new float[] { 0.1f }, ""));
+    verifyOrder(policy, "", new String[] { "a" });
+
+    // Case 2, 2 queues
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 0, 0 },
+        new float[] { 0.1f, 0.0f }, ""));
+    verifyOrder(policy, "", new String[] { "b", "a" });
+
+    // Case 3, 3 queues
+    policy.setQueues(
+        mockCSQueues(new String[] { "a", "b", "c" }, new int[] { 0, 0, 0 },
+            new float[] { 0.1f, 0.0f, 0.2f }, ""));
+    verifyOrder(policy, "", new String[] { "b", "a", "c" });
+
+    // Case 4, 3 queues, ignore priority
+    policy.setQueues(
+        mockCSQueues(new String[] { "a", "b", "c" }, new int[] { 2, 1, 0 },
+            new float[] { 0.1f, 0.0f, 0.2f }, ""));
+    verifyOrder(policy, "", new String[] { "b", "a", "c" });
+
+    // Case 5, 3 queues, look at partition (default)
+    policy.setQueues(
+        mockCSQueues(new String[] { "a", "b", "c" }, new int[] { 2, 1, 0 },
+            new float[] { 0.1f, 0.0f, 0.2f }, "x"));
+    verifyOrder(policy, "", new String[] { "a", "b", "c" });
+
+    // Case 6, 3 queues, look at partition (x)
+    policy.setQueues(
+        mockCSQueues(new String[] { "a", "b", "c" }, new int[] { 2, 1, 0 },
+            new float[] { 0.1f, 0.0f, 0.2f }, "x"));
+    verifyOrder(policy, "x", new String[] { "b", "a", "c" });
+
+    // Case 7, 3 queues, with different accessibility to partition
+    List<CSQueue> queues = mockCSQueues(new String[] { "a", "b", "c" }, new int[] { 2, 1, 0 },
+        new float[] { 0.1f, 0.0f, 0.2f }, "x");
+    // a can access "x"
+    when(queues.get(0).getAccessibleNodeLabels()).thenReturn(ImmutableSet.of("x", "y"));
+    // c can access "x"
+    when(queues.get(2).getAccessibleNodeLabels()).thenReturn(ImmutableSet.of("x", "y"));
+    policy.setQueues(queues);
+    verifyOrder(policy, "x", new String[] { "a", "c", "b" });
+  }
+
+  @Test
+  public void testPriorityUtilizationOrdering() {
+    PriorityUtilizationQueueOrderingPolicy policy =
+        new PriorityUtilizationQueueOrderingPolicy(true);
+
+    // Case 1, one queue
+    policy.setQueues(mockCSQueues(new String[] { "a" }, new int[] { 1 },
+        new float[] { 0.1f }, ""));
+    verifyOrder(policy, "", new String[] { "a" });
+
+    // Case 2, 2 queues, both under utilized, same priority
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 1, 1 },
+        new float[] { 0.2f, 0.1f }, ""));
+    verifyOrder(policy, "", new String[] { "b", "a" });
+
+    // Case 3, 2 queues, both over utilized, same priority
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 1, 1 },
+        new float[] { 1.1f, 1.2f }, ""));
+    verifyOrder(policy, "", new String[] { "a", "b" });
+
+    // Case 4, 2 queues, one under and one over, same priority
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 1, 1 },
+        new float[] { 0.1f, 1.2f }, ""));
+    verifyOrder(policy, "", new String[] { "a", "b" });
+
+    // Case 5, 2 queues, both over utilized, different priority
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 1, 2 },
+        new float[] { 1.1f, 1.2f }, ""));
+    verifyOrder(policy, "", new String[] { "b", "a" });
+
+    // Case 6, 2 queues, both under utilized, different priority
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 1, 2 },
+        new float[] { 0.1f, 0.2f }, ""));
+    verifyOrder(policy, "", new String[] { "b", "a" });
+
+    // Case 7, 2 queues, one under-utilized and one over-utilized,
+    // different priority (1)
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 1, 2 },
+        new float[] { 0.1f, 1.2f }, ""));
+    verifyOrder(policy, "", new String[] { "a", "b" });
+
+    // Case 8, 2 queues, one under-utilized and one over-utilized,
+    // different priority (2)
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 2, 1 },
+        new float[] { 0.1f, 1.2f }, ""));
+    verifyOrder(policy, "", new String[] { "a", "b" });
+
+    // Case 9, 2 queues, one under-utilized and one at capacity, different priority (1)
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 1, 2 },
+        new float[] { 0.1f, 1.0f }, ""));
+    verifyOrder(policy, "", new String[] { "a", "b" });
+
+    // Case 10, 2 queues, one under-utilized and one at capacity, different priority (2)
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 2, 1 },
+        new float[] { 0.1f, 1.0f }, ""));
+    verifyOrder(policy, "", new String[] { "a", "b" });
+
+    // Case 11, 2 queues, one under-utilized and one at capacity, same priority
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 1, 1 },
+        new float[] { 0.1f, 1.0f }, ""));
+    verifyOrder(policy, "", new String[] { "a", "b" });
+
+    // Case 12, 2 queues, both at capacity, different priority
+    policy.setQueues(mockCSQueues(new String[] { "a", "b" }, new int[] { 1, 2 },
+        new float[] { 1.0f, 1.0f }, ""));
+    verifyOrder(policy, "", new String[] { "b", "a" });
+
+    // Case 13, 5 queues, different priority
+    policy.setQueues(mockCSQueues(new String[] { "a", "b", "c", "d", "e" },
+        new int[] { 1, 2, 0, 0, 3 },
+        new float[] { 1.2f, 1.0f, 0.2f, 1.1f, 0.2f }, ""));
+    verifyOrder(policy, "", new String[] { "e", "c", "b", "a", "d" });
+
+    // Case 14, 5 queues, different priority, partition default;
+    policy.setQueues(mockCSQueues(new String[] { "a", "b", "c", "d", "e" },
+        new int[] { 1, 2, 0, 0, 3 },
+        new float[] { 1.2f, 1.0f, 0.2f, 1.1f, 0.2f }, "x"));
+    verifyOrder(policy, "", new String[] { "e", "b", "a", "c", "d" });
+
+    // Case 15, 5 queues, different priority, partition x;
+    policy.setQueues(mockCSQueues(new String[] { "a", "b", "c", "d", "e" },
+        new int[] { 1, 2, 0, 0, 3 },
+        new float[] { 1.2f, 1.0f, 0.2f, 1.1f, 0.2f }, "x"));
+    verifyOrder(policy, "x", new String[] { "e", "c", "b", "a", "d" });
+
+    // Case 16, 5 queues, different priority, partition x; and different
+    // accessibility
+    List<CSQueue> queues = mockCSQueues(new String[] { "a", "b", "c", "d", "e" },
+        new int[] { 1, 2, 0, 0, 3 },
+        new float[] { 1.2f, 1.0f, 0.2f, 1.1f, 0.2f }, "x");
+    // Only a/d has access to x
+    when(queues.get(0).getAccessibleNodeLabels()).thenReturn(
+        ImmutableSet.of("x"));
+    when(queues.get(3).getAccessibleNodeLabels()).thenReturn(
+        ImmutableSet.of("x"));
+    policy.setQueues(queues);
+    verifyOrder(policy, "x", new String[] { "a", "d", "e", "c", "b" });
+  }
+}
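
Taken together, the expectations above imply a tiered ordering: queues that can access the requested partition come first; among those, under-utilized queues precede satisfied ones; ties break by higher priority, then by lower utilization (with a stable sort preserving insertion order). The sketch below models the last three tiers as inferred from the test cases; it is not the actual PriorityUtilizationQueueOrderingPolicy implementation:

    import java.util.Comparator;

    final class QueueViewSketch {
      final int priority;
      final float usedCapacity; // relative to guarantee; >= 1.0f means satisfied
      QueueViewSketch(int priority, float usedCapacity) {
        this.priority = priority;
        this.usedCapacity = usedCapacity;
      }
    }

    final class InferredPriorityUtilizationOrder implements Comparator<QueueViewSketch> {
      @Override
      public int compare(QueueViewSketch q1, QueueViewSketch q2) {
        boolean q1Under = q1.usedCapacity < 1.0f;
        boolean q2Under = q2.usedCapacity < 1.0f;
        if (q1Under != q2Under) {
          return q1Under ? -1 : 1;          // under-utilized queues go first
        }
        if (q1.priority != q2.priority) {
          return q2.priority - q1.priority; // then higher priority first
        }
        return Float.compare(q1.usedCapacity, q2.usedCapacity); // then lower utilization
      }
    }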

+ 1 - 10
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/policy/TestFairOrderingPolicy.java

@@ -20,23 +20,14 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.policy;
 
 import java.util.*;
 
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.ImmutableSet;
 import org.apache.hadoop.yarn.api.records.NodeId;
-import org.apache.hadoop.yarn.api.records.NodeLabel;
-import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
-import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
 import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
-import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
-import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
 import org.junit.Assert;
 import org.junit.Test;
 
-import org.apache.hadoop.yarn.api.records.Priority;
-import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.util.resource.Resources;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;
 
@@ -157,7 +148,7 @@ public class TestFairOrderingPolicy {
 
     // Define top-level queues
     String queuePath = CapacitySchedulerConfiguration.ROOT + ".default";
-    csConf.setOrderingPolicy(queuePath, CapacitySchedulerConfiguration.FAIR_ORDERING_POLICY);
+    csConf.setOrderingPolicy(queuePath, CapacitySchedulerConfiguration.FAIR_APP_ORDERING_POLICY);
     csConf.setOrderingPolicyParameter(queuePath,
         FairOrderingPolicy.ENABLE_SIZE_BASED_WEIGHT, "true");
     csConf.setMaximumApplicationMasterResourcePerQueuePercent(queuePath, 0.1f);