
YARN-4108. CapacityScheduler: Improve preemption to only kill containers that would satisfy the incoming request. (Wangda Tan)
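In short: instead of killing a preempted container as soon as its grace period expires, the policy now only marks it killable; the scheduler counts killable resources as allocatable headroom and actually kills a container only when an incoming request needs its resources, and it can un-mark containers when the imbalance resolves on its own. The sketch below illustrates that mark/revert/kill-on-demand flow. It is a simplified model under assumed names, not the code from this patch (the real work is spread across PreemptionManager, CapacityScheduler and the allocators changed below), and the single-dimension resource is illustrative only.

// Hedged sketch of lazy preemption: mark first, kill only on demand.
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;

class LazyPreemptionSketch {
  // containerId -> resource held (one dimension, e.g. MB of memory)
  private final Map<String, Integer> killable = new LinkedHashMap<>();

  void markKillable(String containerId, int resource) {
    killable.put(containerId, resource);   // phase 1: mark, do not kill yet
  }

  void markNonKillable(String containerId) {
    killable.remove(containerId);          // the policy reverted its decision
  }

  // Phase 2: kill just enough marked containers to satisfy a real request.
  void allocate(int requested, int nodeAvailable) {
    int toFree = requested - nodeAvailable;
    for (Iterator<Map.Entry<String, Integer>> it =
        killable.entrySet().iterator(); toFree > 0 && it.hasNext();) {
      Map.Entry<String, Integer> e = it.next();
      toFree -= e.getValue();
      it.remove();                         // real code sends a kill event here
    }
  }
}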

(cherry picked from commit 7e8c9beb4156dcaeb3a11e60aaa06d2370626913)
Wangda Tan committed 9 years ago
commit c34072a890
37 changed files with 1774 additions and 181 deletions
  1. +120 -46  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicy.java
  2. +1 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainer.java
  3. +1 -1  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/PreemptableResourceScheduler.java
  4. +9 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ResourceLimits.java
  5. +3 -4  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java
  6. +30 -15  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java
  7. +11 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSAssignment.java
  8. +100 -31  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
  9. +12 -2  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
  10. +5 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerContext.java
  11. +69 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
  12. +123 -32  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java
  13. +3 -1  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java
  14. +12 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java
  15. +30 -9  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
  16. +45 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/preemption/KillableContainer.java
  17. +102 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/preemption/PreemptableQueue.java
  18. +165 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/preemption/PreemptionManager.java
  19. +3 -3  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/AssignmentInformation.java
  20. +19 -13  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
  21. +61 -4  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerNode.java
  22. +10 -5  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/SchedulerEventType.java
  23. +2 -2  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMDispatcher.java
  24. +3 -3  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java
  25. +4 -2  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicy.java
  26. +2 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyForNodePartitions.java
  27. +2 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java
  28. +3 -3  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationPriority.java
  29. +4 -4  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
  30. +677 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerPreemption.java
  31. +2 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestChildQueueOrder.java
  32. +3 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java
  33. +97 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestNodeLabelContainerAllocation.java
  34. +2 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestParentQueue.java
  35. +2 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestReservations.java
  36. +36 -0  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestUtils.java
  37. +1 -1  hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java

+ 120 - 46
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicy.java

@@ -35,7 +35,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
-import org.apache.hadoop.yarn.api.records.NodeId;
+import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.Priority;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
@@ -49,6 +49,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.Capacity
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.QueueCapacities;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptableQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.ContainerPreemptEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
@@ -125,8 +126,8 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
   private long maxWaitTime;
   private CapacityScheduler scheduler;
   private long monitoringInterval;
-  private final Map<RMContainer,Long> preempted =
-    new HashMap<RMContainer,Long>();
+  private final Map<RMContainer, Long> preempted = new HashMap<>();
+
   private ResourceCalculator rc;
   private float percentageClusterPreemptionAllowed;
   private double naturalTerminationFactor;
@@ -135,6 +136,10 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
       new HashMap<>();
   private RMNodeLabelsManager nlm;
 
+  // Preemptable Entities, synced from scheduler at every run
+  private Map<String, PreemptableQueue> preemptableEntities = null;
+  private Set<ContainerId> killableContainers;
+
   public ProportionalCapacityPreemptionPolicy() {
     clock = new SystemClock();
   }
@@ -184,6 +189,64 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
     Resource clusterResources = Resources.clone(scheduler.getClusterResource());
     containerBasedPreemptOrKill(root, clusterResources);
   }
+
+  @SuppressWarnings("unchecked")
+  private void cleanupStaledKillableContainers(Resource cluster,
+      Set<String> leafQueueNames) {
+    for (String q : leafQueueNames) {
+      for (TempQueuePerPartition tq : getQueuePartitions(q)) {
+        // When queue's used - killable <= guaranteed and, killable > 0, we need
+        // to check if any of killable containers needs to be reverted
+        if (Resources.lessThanOrEqual(rc, cluster,
+            Resources.subtract(tq.current, tq.killable), tq.idealAssigned)
+            && Resources.greaterThan(rc, cluster, tq.killable, Resources.none())) {
+          // How many killable resources need to be reverted
+          // need-to-revert = already-marked-killable - (current - ideal)
+          Resource toBeRevertedFromKillable = Resources.subtract(tq.killable,
+              Resources.subtract(tq.current, tq.idealAssigned));
+
+          Resource alreadyReverted = Resources.createResource(0);
+
+          for (RMContainer c : preemptableEntities.get(q).getKillableContainers(
+              tq.partition).values()) {
+            if (Resources.greaterThanOrEqual(rc, cluster, alreadyReverted,
+                toBeRevertedFromKillable)) {
+              break;
+            }
+
+            if (Resources.greaterThan(rc, cluster,
+                Resources.add(alreadyReverted, c.getAllocatedResource()),
+                toBeRevertedFromKillable)) {
+              continue;
+            } else {
+              // This container need to be marked to unkillable
+              Resources.addTo(alreadyReverted, c.getAllocatedResource());
+              rmContext.getDispatcher().getEventHandler().handle(
+                  new ContainerPreemptEvent(c.getApplicationAttemptId(), c,
+                      SchedulerEventType.MARK_CONTAINER_FOR_NONKILLABLE));
+            }
+          }
+
+        }
+      }
+    }
+  }
+
+  private void syncKillableContainersFromScheduler() {
+    // sync preemptable entities from scheduler
+    preemptableEntities =
+        scheduler.getPreemptionManager().getShallowCopyOfPreemptableEntities();
+
+    killableContainers = new HashSet<>();
+    for (Map.Entry<String, PreemptableQueue> entry : preemptableEntities
+        .entrySet()) {
+      PreemptableQueue entity = entry.getValue();
+      for (Map<ContainerId, RMContainer> map : entity.getKillableContainers()
+          .values()) {
+        killableContainers.addAll(map.keySet());
+      }
+    }
+  }
 
   /**
    * This method selects and tracks containers to be preempted. If a container
@@ -201,6 +264,8 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
         .getNodeLabelManager().getClusterNodeLabelNames());
     allPartitions.add(RMNodeLabelsManager.NO_LABEL);
 
+    syncKillableContainersFromScheduler();
+
     // extract a summary of the queues from scheduler
     synchronized (scheduler) {
       queueToPartitions.clear();
@@ -228,13 +293,17 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
           recursivelyComputeIdealAssignment(tRoot, totalPreemptionAllowed);
     }
 
+    // remove containers from killable list when we want to preempt less resources
+    // from queue.
+    cleanupStaledKillableContainers(clusterResources, leafQueueNames);
+
     // based on ideal allocation select containers to be preempted from each
     // queue and each application
     Map<ApplicationAttemptId,Set<RMContainer>> toPreempt =
         getContainersToPreempt(leafQueueNames, clusterResources);
 
     if (LOG.isDebugEnabled()) {
-      logToCSV(new ArrayList<String>(leafQueueNames));
+      logToCSV(new ArrayList<>(leafQueueNames));
     }
 
     // if we are in observeOnly mode return before any action is taken
@@ -254,10 +323,10 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
         // if we tried to preempt this for more than maxWaitTime
         if (preempted.get(container) != null &&
             preempted.get(container) + maxWaitTime < clock.getTime()) {
-          // kill it
+          // mark container killable
           rmContext.getDispatcher().getEventHandler().handle(
               new ContainerPreemptEvent(appAttemptId, container,
-                  SchedulerEventType.KILL_PREEMPTED_CONTAINER));
+                  SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE));
           preempted.remove(container);
         } else {
           if (preempted.get(container) != null) {
@@ -333,14 +402,14 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
 
     // qAlloc tracks currently active queues (will decrease progressively as
     // demand is met)
-    List<TempQueuePerPartition> qAlloc = new ArrayList<TempQueuePerPartition>(queues);
+    List<TempQueuePerPartition> qAlloc = new ArrayList<>(queues);
     // unassigned tracks how much resources are still to assign, initialized
     // with the total capacity for this set of queues
     Resource unassigned = Resources.clone(tot_guarant);
 
     // group queues based on whether they have non-zero guaranteed capacity
-    Set<TempQueuePerPartition> nonZeroGuarQueues = new HashSet<TempQueuePerPartition>();
-    Set<TempQueuePerPartition> zeroGuarQueues = new HashSet<TempQueuePerPartition>();
+    Set<TempQueuePerPartition> nonZeroGuarQueues = new HashSet<>();
+    Set<TempQueuePerPartition> zeroGuarQueues = new HashSet<>();
 
     for (TempQueuePerPartition q : qAlloc) {
       if (Resources
@@ -415,8 +484,8 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
     // idealAssigned >= current + pending), remove it from consideration.
     // Sort queues from most under-guaranteed to most over-guaranteed.
     TQComparator tqComparator = new TQComparator(rc, tot_guarant);
-    PriorityQueue<TempQueuePerPartition> orderedByNeed =
-        new PriorityQueue<TempQueuePerPartition>(10, tqComparator);
+    PriorityQueue<TempQueuePerPartition> orderedByNeed = new PriorityQueue<>(10,
+        tqComparator);
     for (Iterator<TempQueuePerPartition> i = qAlloc.iterator(); i.hasNext();) {
       TempQueuePerPartition q = i.next();
       if (Resources.greaterThan(rc, tot_guarant, q.current, q.guaranteed)) {
@@ -474,7 +543,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
   // percentage of guaranteed.
   protected Collection<TempQueuePerPartition> getMostUnderservedQueues(
       PriorityQueue<TempQueuePerPartition> orderedByNeed, TQComparator tqComparator) {
-    ArrayList<TempQueuePerPartition> underserved = new ArrayList<TempQueuePerPartition>();
+    ArrayList<TempQueuePerPartition> underserved = new ArrayList<>();
     while (!orderedByNeed.isEmpty()) {
       TempQueuePerPartition q1 = orderedByNeed.remove();
       underserved.add(q1);
@@ -502,7 +571,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
 
     if (ignoreGuar) {
       for (TempQueuePerPartition q : queues) {
-        q.normalizedGuarantee = (float)  1.0f / ((float) queues.size());
+        q.normalizedGuarantee = 1.0f / queues.size();
       }
     } else {
       for (TempQueuePerPartition q : queues) {
@@ -515,8 +584,9 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
     }
   }
 
-  private String getPartitionByNodeId(NodeId nodeId) {
-    return scheduler.getSchedulerNode(nodeId).getPartition();
+  private String getPartitionByRMContainer(RMContainer rmContainer) {
+    return scheduler.getSchedulerNode(rmContainer.getAllocatedNode())
+        .getPartition();
   }
 
   /**
@@ -534,7 +604,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
       return false;
     }
 
-    String nodePartition = getPartitionByNodeId(rmContainer.getAllocatedNode());
+    String nodePartition = getPartitionByRMContainer(rmContainer);
     Resource toObtainByPartition =
         resourceToObtainByPartitions.get(nodePartition);
 
@@ -575,7 +645,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
       ApplicationAttemptId appAttemptId, RMContainer containerToPreempt) {
     Set<RMContainer> set;
     if (null == (set = preemptMap.get(appAttemptId))) {
-      set = new HashSet<RMContainer>();
+      set = new HashSet<>();
       preemptMap.put(appAttemptId, set);
     }
     set.add(containerToPreempt);
@@ -587,7 +657,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
    * over-capacity queue. It uses {@link #NATURAL_TERMINATION_FACTOR} to
    * account for containers that will naturally complete.
    *
-   * @param queues set of leaf queues to preempt from
+   * @param leafQueueNames set of leaf queues to preempt from
    * @param clusterResource total amount of cluster resources
    * @return a map of applciationID to set of containers to preempt
    */
@@ -595,8 +665,8 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
       Set<String> leafQueueNames, Resource clusterResource) {
 
     Map<ApplicationAttemptId, Set<RMContainer>> preemptMap =
-        new HashMap<ApplicationAttemptId, Set<RMContainer>>();
-    List<RMContainer> skippedAMContainerlist = new ArrayList<RMContainer>();
+        new HashMap<>();
+    List<RMContainer> skippedAMContainerlist = new ArrayList<>();
 
     // Loop all leaf queues
     for (String queueName : leafQueueNames) {
@@ -614,7 +684,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
       LeafQueue leafQueue = null;
 
       Map<String, Resource> resToObtainByPartition =
-          new HashMap<String, Resource>();
+          new HashMap<>();
       for (TempQueuePerPartition qT : getQueuePartitions(queueName)) {
         leafQueue = qT.leafQueue;
         // we act only if we are violating balance by more than
@@ -703,7 +773,6 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
    * @param clusterResource
    * @param preemptMap
    * @param skippedAMContainerlist
-   * @param resToObtain
    * @param skippedAMSize
    * @param maxAMCapacityForThisQueue
    */
@@ -751,7 +820,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
 
     // first drop reserved containers towards rsrcPreempt
     List<RMContainer> reservedContainers =
-        new ArrayList<RMContainer>(app.getReservedContainers());
+        new ArrayList<>(app.getReservedContainers());
     for (RMContainer c : reservedContainers) {
       if (resToObtainByPartition.isEmpty()) {
         return;
@@ -771,8 +840,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
     // if more resources are to be freed go through all live containers in
     // reverse priority and reverse allocation order and mark them for
     // preemption
-    List<RMContainer> liveContainers =
-      new ArrayList<RMContainer>(app.getLiveContainers());
+    List<RMContainer> liveContainers = new ArrayList<>(app.getLiveContainers());
 
     sortContainers(liveContainers);
 
@@ -788,6 +856,11 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
         continue;
       }
 
+      // Skip already marked to killable containers
+      if (killableContainers.contains(c.getContainerId())) {
+        continue;
+      }
+
       // Try to preempt this container
       tryPreemptContainerAndDeductResToObtain(resToObtainByPartition, c,
           clusterResource, preemptMap);
@@ -826,6 +899,10 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
     return "ProportionalCapacityPreemptionPolicy";
   }
 
+  @VisibleForTesting
+  public Map<RMContainer, Long> getToPreemptContainers() {
+    return preempted;
+  }
 
   /**
    * This method walks a tree of CSQueue and clones the portion of the state
@@ -851,6 +928,11 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
           partitionToLookAt);
       Resource guaranteed = Resources.multiply(partitionResource, absCap);
       Resource maxCapacity = Resources.multiply(partitionResource, absMaxCap);
+      Resource killable = Resources.none();
+      if (null != preemptableEntities.get(queueName)) {
+         killable = preemptableEntities.get(queueName)
+            .getKillableResource(partitionToLookAt);
+      }
 
       // when partition is a non-exclusive partition, the actual maxCapacity
       // could more than specified maxCapacity
@@ -875,7 +957,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
               l.getTotalPendingResourcesConsideringUserLimit(
                   partitionResource, partitionToLookAt);
         ret = new TempQueuePerPartition(queueName, current, pending, guaranteed,
-            maxCapacity, preemptionDisabled, partitionToLookAt);
+            maxCapacity, preemptionDisabled, partitionToLookAt, killable);
         if (preemptionDisabled) {
           ret.untouchableExtra = extra;
         } else {
@@ -886,7 +968,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
         Resource pending = Resource.newInstance(0, 0);
         ret =
             new TempQueuePerPartition(curQueue.getQueueName(), current, pending,
-                guaranteed, maxCapacity, false, partitionToLookAt);
+                guaranteed, maxCapacity, false, partitionToLookAt, killable);
         Resource childrensPreemptable = Resource.newInstance(0, 0);
         for (CSQueue c : curQueue.getChildQueues()) {
           TempQueuePerPartition subq =
@@ -932,7 +1014,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
 
     Map<String, TempQueuePerPartition> queuePartitions;
     if (null == (queuePartitions = queueToPartitions.get(queueName))) {
-      queuePartitions = new HashMap<String, TempQueuePerPartition>();
+      queuePartitions = new HashMap<>();
       queueToPartitions.put(queueName, queuePartitions);
     }
     queuePartitions.put(queuePartition.partition, queuePartition);
@@ -971,8 +1053,10 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
     final Resource guaranteed;
     final Resource maxCapacity;
     final String partition;
+    final Resource killable;
     Resource idealAssigned;
     Resource toBePreempted;
+
     // For logging purpose
     Resource actuallyPreempted;
     Resource untouchableExtra;
@@ -986,7 +1070,7 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
 
     TempQueuePerPartition(String queueName, Resource current, Resource pending,
         Resource guaranteed, Resource maxCapacity, boolean preemptionDisabled,
-        String partition) {
+        String partition, Resource killableResource) {
       this.queueName = queueName;
       this.current = current;
       this.pending = pending;
@@ -996,11 +1080,12 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
       this.actuallyPreempted = Resource.newInstance(0, 0);
       this.toBePreempted = Resource.newInstance(0, 0);
       this.normalizedGuarantee = Float.NaN;
-      this.children = new ArrayList<TempQueuePerPartition>();
+      this.children = new ArrayList<>();
       this.untouchableExtra = Resource.newInstance(0, 0);
       this.preemptableExtra = Resource.newInstance(0, 0);
       this.preemptionDisabled = preemptionDisabled;
       this.partition = partition;
+      this.killable = killableResource;
     }
 
     public void setLeafQueue(LeafQueue l){
@@ -1018,12 +1103,6 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
       Resources.addTo(pending, q.pending);
     }
 
-    public void addChildren(ArrayList<TempQueuePerPartition> queues) {
-      assert leafQueue == null;
-      children.addAll(queues);
-    }
-
-
     public ArrayList<TempQueuePerPartition> getChildren(){
       return children;
     }
@@ -1064,18 +1143,13 @@ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolic
       return sb.toString();
     }
 
-    public void printAll() {
-      LOG.info(this.toString());
-      for (TempQueuePerPartition sub : this.getChildren()) {
-        sub.printAll();
-      }
-    }
-
     public void assignPreemption(float scalingFactor,
         ResourceCalculator rc, Resource clusterResource) {
-      if (Resources.greaterThan(rc, clusterResource, current, idealAssigned)) {
-          toBePreempted = Resources.multiply(
-              Resources.subtract(current, idealAssigned), scalingFactor);
+      if (Resources.greaterThan(rc, clusterResource,
+          Resources.subtract(current, killable), idealAssigned)) {
+        toBePreempted = Resources.multiply(Resources.subtract(
+            Resources.subtract(current, killable), idealAssigned),
+            scalingFactor);
       } else {
         toBePreempted = Resource.newInstance(0, 0);
       }
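The net effect of the assignPreemption change above: resources a queue already has marked killable no longer count toward its to-be-preempted amount, so the policy does not ask for the same resources twice. A worked example with assumed numbers: say a queue currently uses 100 GB, its ideal assignment is 60 GB, 30 GB of its containers are already marked killable, and the scaling factor is 0.5. Then

    toBePreempted = ((current - killable) - idealAssigned) * scalingFactor
                  = ((100 - 30) - 60) * 0.5
                  = 5 GB

whereas the old expression, (current - idealAssigned) * scalingFactor, would have requested 20 GB and re-targeted containers that were already on the kill list.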

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainer.java

@@ -30,6 +30,7 @@ import org.apache.hadoop.yarn.api.records.Priority;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.api.records.ResourceRequest;
 import org.apache.hadoop.yarn.event.EventHandler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue;
 
 /**
  * Represents the ResourceManager's view of an application container. See 

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/PreemptableResourceScheduler.java

@@ -45,6 +45,6 @@ public interface PreemptableResourceScheduler extends ResourceScheduler {
    * Ask the scheduler to forcibly interrupt the container given as input
    * @param container
    */
-  void killPreemptedContainer(RMContainer container);
+  void markContainerForKillable(RMContainer container);
 
 }

+ 9 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ResourceLimits.java

@@ -38,6 +38,8 @@ public class ResourceLimits {
   // containers.
   private volatile Resource headroom;
 
+  private boolean allowPreempt = false;
+
   public ResourceLimits(Resource limit) {
     this(limit, Resources.none());
   }
@@ -72,4 +74,11 @@ public class ResourceLimits {
     this.amountNeededUnreserve = amountNeededUnreserve;
   }
 
+  public boolean isAllowPreemption() {
+    return allowPreempt;
+  }
+
+  public void setIsAllowPreemption(boolean allowPreempt) {
+   this.allowPreempt = allowPreempt;
+  }
 }

+ 3 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java

@@ -64,9 +64,8 @@ public abstract class SchedulerNode {
   private volatile ResourceUtilization nodeUtilization =
       ResourceUtilization.newInstance(0, 0, 0f);
 
-
   /* set of containers that are allocated containers */
-  private final Map<ContainerId, RMContainer> launchedContainers =
+  protected final Map<ContainerId, RMContainer> launchedContainers =
       new HashMap<ContainerId, RMContainer>();
 
   private final RMNode rmNode;
@@ -163,7 +162,7 @@ public abstract class SchedulerNode {
         + " available after allocation");
   }
 
-  private synchronized void changeContainerResource(ContainerId containerId,
+  protected synchronized void changeContainerResource(ContainerId containerId,
       Resource deltaResource, boolean increase) {
     if (increase) {
       deductAvailableResource(deltaResource);
@@ -228,7 +227,7 @@ public abstract class SchedulerNode {
     return false;
   }
 
-  private synchronized void updateResource(Container container) {
+  protected synchronized void updateResource(Container container) {
     addAvailableResource(container.getResource());
     --numContainers;
   }

+ 30 - 15
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java

@@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 
@@ -44,6 +45,7 @@ import org.apache.hadoop.yarn.security.PrivilegedEntity;
 import org.apache.hadoop.yarn.security.PrivilegedEntity.EntityType;
 import org.apache.hadoop.yarn.security.YarnAuthorizationProvider;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
@@ -435,11 +437,8 @@ public abstract class AbstractCSQueue implements CSQueue {
        */
       Resource queueMaxResource =
           getQueueMaxResource(nodePartition, clusterResource);
-      if (nodePartition.equals(RMNodeLabelsManager.NO_LABEL)) {
-        return Resources.min(resourceCalculator, clusterResource,
-            queueMaxResource, currentResourceLimits.getLimit());
-      }
-      return queueMaxResource;
+      return Resources.min(resourceCalculator, clusterResource,
+          queueMaxResource, currentResourceLimits.getLimit());
     } else if (schedulingMode == SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY) {
       // When we doing non-exclusive resource allocation, maximum capacity of
       // all queues on this label equals to total resource with the label.
@@ -476,12 +475,19 @@ public abstract class AbstractCSQueue implements CSQueue {
 
     Resource nowTotalUsed = queueUsage.getUsed(nodePartition);
 
-    // Set headroom for currentResourceLimits
-    currentResourceLimits.setHeadroom(Resources.subtract(currentLimitResource,
-        nowTotalUsed));
+    // Set headroom for currentResourceLimits:
+    // When queue is a parent queue: Headroom = limit - used + killable
+    // When queue is a leaf queue: Headroom = limit - used (leaf queue cannot preempt itself)
+    Resource usedExceptKillable = nowTotalUsed;
+    if (null != getChildQueues() && !getChildQueues().isEmpty()) {
+      usedExceptKillable = Resources.subtract(nowTotalUsed,
+          getTotalKillableResource(nodePartition));
+    }
+    currentResourceLimits.setHeadroom(
+        Resources.subtract(currentLimitResource, usedExceptKillable));
 
     if (Resources.greaterThanOrEqual(resourceCalculator, clusterResource,
-        nowTotalUsed, currentLimitResource)) {
+        usedExceptKillable, currentLimitResource)) {
 
       // if reservation continous looking enabled, check to see if could we
       // potentially use this node instead of a reserved node if the application
@@ -493,7 +499,7 @@ public abstract class AbstractCSQueue implements CSQueue {
               resourceCouldBeUnreserved, Resources.none())) {
         // resource-without-reserved = used - reserved
         Resource newTotalWithoutReservedResource =
-            Resources.subtract(nowTotalUsed, resourceCouldBeUnreserved);
+            Resources.subtract(usedExceptKillable, resourceCouldBeUnreserved);
 
         // when total-used-without-reserved-resource < currentLimit, we still
         // have chance to allocate on this node by unreserving some containers
@@ -628,11 +634,10 @@ public abstract class AbstractCSQueue implements CSQueue {
     // considering all labels in cluster, only those labels which are
     // use some resource of this queue can be considered.
     Set<String> nodeLabels = new HashSet<String>();
-    if (this.getAccessibleNodeLabels() != null
-        && this.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) {
-      nodeLabels.addAll(Sets.union(this.getQueueCapacities()
-          .getNodePartitionsSet(), this.getQueueResourceUsage()
-          .getNodePartitionsSet()));
+    if (this.getAccessibleNodeLabels() != null && this.getAccessibleNodeLabels()
+        .contains(RMNodeLabelsManager.ANY)) {
+      nodeLabels.addAll(Sets.union(this.getQueueCapacities().getNodePartitionsSet(),
+          this.getQueueResourceUsage().getNodePartitionsSet()));
     } else {
       nodeLabels.addAll(this.getAccessibleNodeLabels());
     }
@@ -644,4 +649,14 @@ public abstract class AbstractCSQueue implements CSQueue {
     }
     return nodeLabels;
   }
+
+  public Resource getTotalKillableResource(String partition) {
+    return csContext.getPreemptionManager().getKillableResource(queueName,
+        partition);
+  }
+
+  public Iterator<RMContainer> getKillableContainers(String partition) {
+    return csContext.getPreemptionManager().getKillableContainers(queueName,
+        partition);
+  }
 }
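To see what the headroom change in this file does, take assumed numbers: a parent queue with a limit of 100 GB, 90 GB used, and 20 GB of that already marked killable gets headroom 100 - (90 - 20) = 30 GB, so scheduling under it can proceed as if the killable containers were already gone. A leaf queue with the same numbers keeps headroom 100 - 90 = 10 GB, since, per the comment in the hunk above, a leaf queue cannot preempt itself to make room.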

+ 11 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSAssignment.java

@@ -26,6 +26,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.Assignment
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.util.resource.Resources;
 
+import java.util.List;
+
 @Private
 @Unstable
 public class CSAssignment {
@@ -51,6 +53,7 @@ public class CSAssignment {
   private boolean fulfilledReservation;
   private final AssignmentInformation assignmentInformation;
   private boolean increaseAllocation;
+  private List<RMContainer> containersToKill;
 
   public CSAssignment(Resource resource, NodeType type) {
     this(resource, type, null, null, SkippedType.NONE, false);
@@ -160,4 +163,12 @@ public class CSAssignment {
   public void setIncreasedAllocation(boolean flag) {
     increaseAllocation = flag;
   }
+
+  public void setContainersToKill(List<RMContainer> containersToKill) {
+    this.containersToKill = containersToKill;
+  }
+
+  public List<RMContainer> getContainersToKill() {
+    return containersToKill;
+  }
 }

+ 100 - 31
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java

@@ -94,6 +94,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerDynamicE
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerHealth;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.KillableContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.AssignmentInformation;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.QueueEntitlement;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
@@ -149,6 +151,10 @@ public class CapacityScheduler extends
   // timeout to join when we stop this service
   protected final long THREAD_JOIN_TIMEOUT_MS = 1000;
 
+  private PreemptionManager preemptionManager = new PreemptionManager();
+
+  private volatile boolean isLazyPreemptionEnabled = false;
+
   static final Comparator<CSQueue> nonPartitionedQueueComparator =
       new Comparator<CSQueue>() {
     @Override
@@ -301,12 +307,11 @@ public class CapacityScheduler extends
     initMaximumResourceCapability(this.conf.getMaximumAllocation());
     this.calculator = this.conf.getResourceCalculator();
     this.usePortForNodeName = this.conf.getUsePortForNodeName();
-    this.applications =
-        new ConcurrentHashMap<ApplicationId,
-            SchedulerApplication<FiCaSchedulerApp>>();
+    this.applications = new ConcurrentHashMap<>();
     this.labelManager = rmContext.getNodeLabelManager();
     authorizer = YarnAuthorizationProvider.getInstance(yarnConf);
     initializeQueues(this.conf);
+    this.isLazyPreemptionEnabled = conf.getLazyPreemptionEnabled();
 
     scheduleAsynchronously = this.conf.getScheduleAynschronously();
     asyncScheduleInterval =
@@ -372,6 +377,9 @@ public class CapacityScheduler extends
       refreshMaximumAllocation(this.conf.getMaximumAllocation());
       throw new IOException("Failed to re-init queues", t);
     }
+
+    // update lazy preemption
+    this.isLazyPreemptionEnabled = this.conf.getLazyPreemptionEnabled();
   }
 
   long getAsyncScheduleInterval() {
@@ -506,6 +514,9 @@ public class CapacityScheduler extends
     LOG.info("Initialized root queue " + root);
     updatePlacementRules();
     setQueueAcls(authorizer, queues);
+
+    // Notify Preemption Manager
+    preemptionManager.refreshQueues(null, root);
   }
 
   @Lock(CapacityScheduler.class)
@@ -533,6 +544,9 @@ public class CapacityScheduler extends
 
     labelManager.reinitializeQueueLabels(getQueueToLabels());
     setQueueAcls(authorizer, queues);
+
+    // Notify Preemption Manager
+    preemptionManager.refreshQueues(null, root);
   }
 
   @VisibleForTesting
@@ -1244,8 +1258,9 @@ public class CapacityScheduler extends
 
     // Try to schedule more if there are no reservations to fulfill
     if (node.getReservedContainer() == null) {
-      if (calculator.computeAvailableContainers(node.getAvailableResource(),
-        minimumAllocation) > 0) {
+      if (calculator.computeAvailableContainers(Resources
+              .add(node.getAvailableResource(), node.getTotalKillableResources()),
+          minimumAllocation) > 0) {
         if (LOG.isDebugEnabled()) {
           LOG.debug("Trying to schedule on node: " + node.getNodeName() +
               ", available: " + node.getAvailableResource());
@@ -1254,10 +1269,8 @@ public class CapacityScheduler extends
         assignment = root.assignContainers(
             clusterResource,
             node,
-            // TODO, now we only consider limits for parent for non-labeled
-            // resources, should consider labeled resources as well.
             new ResourceLimits(labelManager.getResourceByLabel(
-                RMNodeLabelsManager.NO_LABEL, clusterResource)),
+                node.getPartition(), clusterResource)),
             SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
         if (Resources.greaterThan(calculator, clusterResource,
             assignment.getResource(), Resources.none())) {
@@ -1427,11 +1440,20 @@ public class CapacityScheduler extends
       markContainerForPreemption(aid, containerToBePreempted);
     }
     break;
-    case KILL_PREEMPTED_CONTAINER:
+    case MARK_CONTAINER_FOR_KILLABLE:
     {
-      ContainerPreemptEvent killContainerEvent = (ContainerPreemptEvent)event;
-      RMContainer containerToBeKilled = killContainerEvent.getContainer();
-      killPreemptedContainer(containerToBeKilled);
+      ContainerPreemptEvent containerKillableEvent = (ContainerPreemptEvent)event;
+      RMContainer killableContainer = containerKillableEvent.getContainer();
+      markContainerForKillable(killableContainer);
+    }
+    break;
+    case MARK_CONTAINER_FOR_NONKILLABLE:
+    {
+      if (isLazyPreemptionEnabled) {
+        ContainerPreemptEvent cancelKillContainerEvent =
+            (ContainerPreemptEvent) event;
+        markContainerForNonKillable(cancelKillContainerEvent.getContainer());
+      }
     }
     break;
     default:
@@ -1540,14 +1562,14 @@ public class CapacityScheduler extends
  protected void completedContainerInternal(
      RMContainer rmContainer, ContainerStatus containerStatus,
      RMContainerEventType event) {
-    
    Container container = rmContainer.getContainer();
+    ContainerId containerId = container.getId();
    
    // Get the application for the finished container
    FiCaSchedulerApp application =
        getCurrentAttemptForContainer(container.getId());
    ApplicationId appId =
-        container.getId().getApplicationAttemptId().getApplicationId();
+        containerId.getApplicationAttemptId().getApplicationId();
    if (application == null) {
      LOG.info("Container " + container + " of" + " finished application "
          + appId + " completed with event " + event);
@@ -1561,15 +1583,6 @@ public class CapacityScheduler extends
    LeafQueue queue = (LeafQueue)application.getQueue();
    queue.completedContainer(clusterResource, application, node, 
        rmContainer, containerStatus, event, null, true);
-
-    if (containerStatus.getExitStatus() == ContainerExitStatus.PREEMPTED) {
-      schedulerHealth.updatePreemption(Time.now(), container.getNodeId(),
-        container.getId(), queue.getQueuePath());
-      schedulerHealth.updateSchedulerPreemptionCounts(1);
-    } else {
-      schedulerHealth.updateRelease(lastNodeUpdateTime, container.getNodeId(),
-        container.getId(), queue.getQueuePath());
-    }
  }
  
  @Override
@@ -1605,7 +1618,7 @@ public class CapacityScheduler extends
      ApplicationAttemptId applicationAttemptId) {
    return super.getApplicationAttempt(applicationAttemptId);
  }
-  
+
  @Lock(Lock.NoLock.class)
  public FiCaSchedulerNode getNode(NodeId nodeId) {
    return nodes.get(nodeId);
@@ -1651,15 +1664,60 @@ public class CapacityScheduler extends
    }
  }

-  @Override
-  public void killPreemptedContainer(RMContainer cont) {
+  public synchronized void markContainerForKillable(
+      RMContainer killableContainer) {
+    if (LOG.isDebugEnabled()) {
+      LOG.debug(SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE + ": container"
+          + killableContainer.toString());
+    }
+
+    if (!isLazyPreemptionEnabled) {
+      super.completedContainer(killableContainer, SchedulerUtils
+          .createPreemptedContainerStatus(killableContainer.getContainerId(),
+              SchedulerUtils.PREEMPTED_CONTAINER), RMContainerEventType.KILL);
+    } else {
+      FiCaSchedulerNode node = (FiCaSchedulerNode) getSchedulerNode(
+          killableContainer.getAllocatedNode());
+
+      FiCaSchedulerApp application = getCurrentAttemptForContainer(
+          killableContainer.getContainerId());
+
+      node.markContainerToKillable(killableContainer.getContainerId());
+
+      // notify PreemptionManager
+      // Get the application for the finished container
+      if (null != application) {
+        String leafQueueName = application.getCSLeafQueue().getQueueName();
+        getPreemptionManager().addKillableContainer(
+            new KillableContainer(killableContainer, node.getPartition(),
+                leafQueueName));
+      }
+    }
+  }
+
+  private synchronized void markContainerForNonKillable(
+      RMContainer nonKillableContainer) {
    if (LOG.isDebugEnabled()) {
-      LOG.debug(SchedulerEventType.KILL_PREEMPTED_CONTAINER + ": container"
-          + cont.toString());
+      LOG.debug(
+          SchedulerEventType.MARK_CONTAINER_FOR_NONKILLABLE + ": container"
+              + nonKillableContainer.toString());
+    }
+
+    FiCaSchedulerNode node = (FiCaSchedulerNode) getSchedulerNode(
+        nonKillableContainer.getAllocatedNode());
+
+    FiCaSchedulerApp application = getCurrentAttemptForContainer(
+        nonKillableContainer.getContainerId());
+
+    node.markContainerToNonKillable(nonKillableContainer.getContainerId());
+
+    // notify PreemptionManager
+    // Get the application for the finished container
+    if (null != application) {
+      String leafQueueName = application.getCSLeafQueue().getQueueName();
+      getPreemptionManager().removeKillableContainer(
+          new KillableContainer(nonKillableContainer, node.getPartition(),
+              leafQueueName));
    }
-    super.completedContainer(cont, SchedulerUtils
-        .createPreemptedContainerStatus(cont.getContainerId(),
-        SchedulerUtils.PREEMPTED_CONTAINER), RMContainerEventType.KILL);
  }

  @Override
@@ -1976,6 +2034,7 @@ public class CapacityScheduler extends
    return ret;
  }

+  @Override
  public SchedulerHealth getSchedulerHealth() {
    return this.schedulerHealth;
  }
@@ -1984,6 +2043,11 @@ public class CapacityScheduler extends
    this.lastNodeUpdateTime = time;
  }

+  @Override
+  public long getLastNodeUpdateTime() {
+    return lastNodeUpdateTime;
+  }
+
  @Override
  public Priority checkAndGetApplicationPriority(Priority priorityFromContext,
      String user, String queueName, ApplicationId applicationId)
@@ -2095,4 +2159,9 @@ public class CapacityScheduler extends
  public ResourceUsage getClusterResourceUsage() {
    return root.getQueueResourceUsage();
  }
+
+  @Override
+  public PreemptionManager getPreemptionManager() {
+    return preemptionManager;
+  }
}
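
Taken together, the scheduler changes above replace the old one-shot KILL_PREEMPTED_CONTAINER event with a mark/unmark pair gated by the new lazy-preemption flag. A minimal sketch of that branching, using stand-in types rather than the real SchedulerEventType, RMContainer, and PreemptionManager classes:

// Illustrative only: the enum and println calls are simplified stand-ins.
public class LazyPreemptionDispatchSketch {
  enum PreemptEventKind { MARK_CONTAINER_FOR_KILLABLE, MARK_CONTAINER_FOR_NONKILLABLE }

  private final boolean isLazyPreemptionEnabled;

  public LazyPreemptionDispatchSketch(boolean isLazyPreemptionEnabled) {
    this.isLazyPreemptionEnabled = isLazyPreemptionEnabled;
  }

  public void handle(PreemptEventKind kind, String containerId) {
    switch (kind) {
    case MARK_CONTAINER_FOR_KILLABLE:
      if (!isLazyPreemptionEnabled) {
        // eager mode: behaves like the old path, complete the container
        // immediately with a PREEMPTED status
        System.out.println("kill " + containerId);
      } else {
        // lazy mode: only record the container as killable on its node and
        // in the PreemptionManager; it dies later, and only if its space is
        // actually needed to satisfy an incoming request
        System.out.println("mark killable " + containerId);
      }
      break;
    case MARK_CONTAINER_FOR_NONKILLABLE:
      if (isLazyPreemptionEnabled) {
        // the preemption demand went away before the container was killed
        System.out.println("unmark " + containerId);
      }
      break;
    }
  }

  public static void main(String[] args) {
    new LazyPreemptionDispatchSketch(true)
        .handle(PreemptEventKind.MARK_CONTAINER_FOR_KILLABLE, "container_01");
  }
}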

+ 12 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java

@@ -264,6 +264,12 @@ public class CapacitySchedulerConfiguration extends ReservationSchedulerConfigur
  public static final String RESERVATION_ENFORCEMENT_WINDOW =
      "reservation-enforcement-window";

+  @Private
+  public static final String LAZY_PREEMPTION_ENALBED = PREFIX + "lazy-preemption-enabled";
+
+  @Private
+  public static final boolean DEFAULT_LAZY_PREEMPTION_ENABLED = false;
+
  public CapacitySchedulerConfiguration() {
    this(new Configuration());
  }
@@ -1028,7 +1034,11 @@ public class CapacitySchedulerConfiguration extends ReservationSchedulerConfigur
  @VisibleForTesting
  public void setOrderingPolicyParameter(String queue,
      String parameterKey, String parameterValue) {
-    set(getQueuePrefix(queue) + ORDERING_POLICY + "."
-        + parameterKey, parameterValue);
+    set(getQueuePrefix(queue) + ORDERING_POLICY + "." + parameterKey,
+        parameterValue);
+  }
+
+  public boolean getLazyPreemptionEnabled() {
+    return getBoolean(LAZY_PREEMPTION_ENALBED, DEFAULT_LAZY_PREEMPTION_ENABLED);
  }
}
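
getLazyPreemptionEnabled() reads the new flag; with PREFIX expanding to yarn.scheduler.capacity., the effective property key is yarn.scheduler.capacity.lazy-preemption-enabled (the constant name keeps the spelling used in the commit). A sketch of flipping it programmatically, e.g. in a test, assuming only the class changed in this diff:

import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;

public class LazyPreemptionFlagExample {
  public static void main(String[] args) {
    CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();
    // defaults to false (DEFAULT_LAZY_PREEMPTION_ENABLED)
    System.out.println(conf.getLazyPreemptionEnabled());
    // enable lazy preemption: marked containers stay alive until their
    // resources are needed by a real request
    conf.setBoolean(CapacitySchedulerConfiguration.LAZY_PREEMPTION_ENALBED, true);
    System.out.println(conf.getLazyPreemptionEnabled()); // true
  }
}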

+ 5 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerContext.java

@@ -25,6 +25,7 @@ import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerHealth;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager;
@@ -75,4 +76,8 @@ public interface CapacitySchedulerContext {
   *         cluster.
   */
  ResourceUsage getClusterResourceUsage();
+
+  PreemptionManager getPreemptionManager();
+
+  long getLastNodeUpdateTime();
}

+ 69 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java

@@ -28,9 +28,11 @@ import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.authorize.AccessControlList;
+import org.apache.hadoop.util.Time;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.QueueACL;
@@ -54,7 +56,9 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedContainerChangeRequest;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt.AMState;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerHealth;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.KillableContainer;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.policy.FifoOrderingPolicyForPendingApps;
@@ -854,6 +858,40 @@ public class LeafQueue extends AbstractCSQueue {
      assignment.setExcessReservation(null);
    }
  }
+
+  private void killToPreemptContainers(Resource clusterResource,
+      FiCaSchedulerNode node,
+      CSAssignment assignment) {
+    if (assignment.getContainersToKill() != null) {
+      StringBuilder sb = new StringBuilder("Killing containers: [");
+
+      for (RMContainer c : assignment.getContainersToKill()) {
+        FiCaSchedulerApp application = csContext.getApplicationAttempt(
+            c.getApplicationAttemptId());
+        LeafQueue q = application.getCSLeafQueue();
+        q.completedContainer(clusterResource, application, node, c, SchedulerUtils
+                .createPreemptedContainerStatus(c.getContainerId(),
+                    SchedulerUtils.PREEMPTED_CONTAINER), RMContainerEventType.KILL,
+            null, false);
+        sb.append("(container=" + c.getContainerId() + " resource=" + c
+            .getAllocatedResource() + ")");
+      }
+
+      sb.append("] for container=" + assignment.getAssignmentInformation()
+          .getFirstAllocatedOrReservedContainerId() + " resource=" + assignment
+          .getResource());
+      LOG.info(sb.toString());
+
+    }
+  }
+
+  private void setPreemptionAllowed(ResourceLimits limits, String nodePartition) {
+    // Set preemption-allowed:
+    // For leaf queue, only under-utilized queue is allowed to preempt resources from other queues
+    float usedCapacity = queueCapacities.getAbsoluteUsedCapacity(nodePartition);
+    float guaranteedCapacity = queueCapacities.getAbsoluteCapacity(nodePartition);
+    limits.setIsAllowPreemption(usedCapacity < guaranteedCapacity);
+  }
  
  @Override
  public synchronized CSAssignment assignContainers(Resource clusterResource,
@@ -866,6 +904,8 @@ public class LeafQueue extends AbstractCSQueue {
          + " #applications=" + orderingPolicy.getNumSchedulableEntities());
    }

+    setPreemptionAllowed(currentResourceLimits, node.getPartition());
+
    // Check for reserved resources
    RMContainer reservedContainer = node.getReservedContainer();
    if (reservedContainer != null) {
@@ -877,6 +917,7 @@ public class LeafQueue extends AbstractCSQueue {
                currentResourceLimits, schedulingMode, reservedContainer);
        handleExcessReservedContainer(clusterResource, assignment, node,
            application);
+        killToPreemptContainers(clusterResource, node, assignment);
        return assignment;
      }
    }
@@ -938,6 +979,7 @@ public class LeafQueue extends AbstractCSQueue {
      
      handleExcessReservedContainer(clusterResource, assignment, node,
          application);
+      killToPreemptContainers(clusterResource, node, assignment);

      if (Resources.greaterThan(resourceCalculator, clusterResource, assigned,
          Resources.none())) {
@@ -1328,11 +1370,34 @@ public class LeafQueue extends AbstractCSQueue {
  }


+  private void updateSchedulerHealthForCompletedContainer(
+      RMContainer rmContainer, ContainerStatus containerStatus) {
+    // Update SchedulerHealth for released / preempted container
+    SchedulerHealth schedulerHealth = csContext.getSchedulerHealth();
+    if (null == schedulerHealth) {
+      // Only do update if we have schedulerHealth
+      return;
+    }
+
+    if (containerStatus.getExitStatus() == ContainerExitStatus.PREEMPTED) {
+      schedulerHealth.updatePreemption(Time.now(), rmContainer.getAllocatedNode(),
+          rmContainer.getContainerId(), getQueuePath());
+      schedulerHealth.updateSchedulerPreemptionCounts(1);
+    } else {
+      schedulerHealth.updateRelease(csContext.getLastNodeUpdateTime(),
+          rmContainer.getAllocatedNode(), rmContainer.getContainerId(),
+          getQueuePath());
+    }
+  }
+
  @Override
  public void completedContainer(Resource clusterResource, 
      FiCaSchedulerApp application, FiCaSchedulerNode node, RMContainer rmContainer, 
      ContainerStatus containerStatus, RMContainerEventType event, CSQueue childQueue,
      boolean sortQueues) {
+    // Update SchedulerHealth for released / preempted container
+    updateSchedulerHealthForCompletedContainer(rmContainer, containerStatus);
+
    if (application != null) {
      // unreserve container increase request if it previously reserved.
      if (rmContainer.hasIncreaseReservation()) {
@@ -1383,6 +1448,10 @@ public class LeafQueue extends AbstractCSQueue {
          rmContainer, null, event, this, sortQueues);
      }
    }
+
+    // Notify PreemptionManager
+    csContext.getPreemptionManager().removeKillableContainer(
+        new KillableContainer(rmContainer, node.getPartition(), queueName));
  }

  synchronized void allocateResource(Resource clusterResource,
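
Two of the LeafQueue additions carry the policy decisions: setPreemptionAllowed only lets a leaf queue trade on killable resources while it sits below its guaranteed capacity in the node's partition, and killToPreemptContainers completes the victims once an assignment actually depends on them. A minimal sketch of the gate, assuming plain floats in place of the QueueCapacities object:

public class PreemptionGateSketch {
  // Mirrors setPreemptionAllowed: a leaf queue may claim killable resources
  // from other queues only while it is under-utilized (used < guaranteed).
  static boolean isAllowPreemption(float absoluteUsedCapacity,
      float absoluteGuaranteedCapacity) {
    return absoluteUsedCapacity < absoluteGuaranteedCapacity;
  }

  public static void main(String[] args) {
    System.out.println(isAllowPreemption(0.3f, 0.5f)); // true: below its guarantee
    System.out.println(isAllowPreemption(0.6f, 0.5f)); // false: at or over it
  }
}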

+ 123 - 32
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java

@@ -18,18 +18,6 @@

package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;

-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -49,6 +37,7 @@ import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
+import org.apache.hadoop.yarn.nodelabels.RMNodeLabel;
import org.apache.hadoop.yarn.security.AccessType;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
@@ -57,12 +46,25 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerStat
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
-import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedContainerChangeRequest;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
import org.apache.hadoop.yarn.util.resource.Resources;

+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
@Private
@Evolving
public class ParentQueue extends AbstractCSQueue {
@@ -398,6 +400,11 @@ public class ParentQueue extends AbstractCSQueue {
    // if our queue cannot access this node, just return
    if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY
        && !accessibleToPartition(node.getPartition())) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Skip this queue=" + getQueuePath()
+            + ", because it is not able to access partition=" + node
+            .getPartition());
+      }
      return CSAssignment.NULL_ASSIGNMENT;
    }
    
@@ -443,7 +450,7 @@ public class ParentQueue extends AbstractCSQueue {
              resourceCalculator, clusterResource, 
              assignedToChild.getResource(), Resources.none())) {
        // Track resource utilization for the parent-queue
-        super.allocateResource(clusterResource, assignedToChild.getResource(),
+        allocateResource(clusterResource, assignedToChild.getResource(),
            node.getPartition(), assignedToChild.isIncreasedAllocation());
        
        // Track resource utilization in this pass of the scheduler
@@ -512,29 +519,38 @@ public class ParentQueue extends AbstractCSQueue {
  }

  private boolean canAssign(Resource clusterResource, FiCaSchedulerNode node) {
-    return (node.getReservedContainer() == null) && 
-        Resources.greaterThanOrEqual(resourceCalculator, clusterResource, 
-            node.getAvailableResource(), minimumAllocation);
+    // Two conditions need to meet when trying to allocate:
+    // 1) Node doesn't have reserved container
+    // 2) Node's available-resource + killable-resource should > 0
+    return node.getReservedContainer() == null && Resources.greaterThanOrEqual(
+        resourceCalculator, clusterResource, Resources
+            .add(node.getAvailableResource(), node.getTotalKillableResources()),
+        minimumAllocation);
  }
-  
+
  private ResourceLimits getResourceLimitsOfChild(CSQueue child,
-      Resource clusterResource, Resource parentLimits) {
+      Resource clusterResource, Resource parentLimits,
+      String nodePartition) {
    // Set resource-limit of a given child, child.limit =
    // min(my.limit - my.used + child.used, child.max)

    // Parent available resource = parent-limit - parent-used-resource
-    Resource parentMaxAvailableResource =
-        Resources.subtract(parentLimits, getUsedResources());
+    Resource parentMaxAvailableResource = Resources.subtract(
+        parentLimits, queueUsage.getUsed(nodePartition));
+    // Deduct killable from used
+    Resources.addTo(parentMaxAvailableResource,
+        getTotalKillableResource(nodePartition));

    // Child's limit = parent-available-resource + child-used
-    Resource childLimit =
-        Resources.add(parentMaxAvailableResource, child.getUsedResources());
+    Resource childLimit = Resources.add(parentMaxAvailableResource,
+        child.getQueueResourceUsage().getUsed(nodePartition));

    // Get child's max resource
-    Resource childConfiguredMaxResource =
-        Resources.multiplyAndNormalizeDown(resourceCalculator, labelManager
-            .getResourceByLabel(RMNodeLabelsManager.NO_LABEL, clusterResource),
-            child.getAbsoluteMaximumCapacity(), minimumAllocation);
+    Resource childConfiguredMaxResource = Resources.multiplyAndNormalizeDown(
+        resourceCalculator,
+        labelManager.getResourceByLabel(nodePartition, clusterResource),
+        child.getQueueCapacities().getAbsoluteMaximumCapacity(nodePartition),
+        minimumAllocation);

    // Child's limit should be capped by child configured max resource
    childLimit =
@@ -586,7 +602,7 @@ public class ParentQueue extends AbstractCSQueue {

      // Get ResourceLimits of child queue before assign containers
      ResourceLimits childLimits =
-          getResourceLimitsOfChild(childQueue, cluster, parentLimits);
+          getResourceLimitsOfChild(childQueue, cluster, parentLimits, node.getPartition());
      
      CSAssignment childAssignment = childQueue.assignContainers(cluster, node,
          childLimits, schedulingMode);
@@ -751,7 +767,7 @@ public class ParentQueue extends AbstractCSQueue {
      // Get ResourceLimits of child queue before assign containers
      ResourceLimits childLimits =
          getResourceLimitsOfChild(childQueue, clusterResource,
-              resourceLimits.getLimit());
+              resourceLimits.getLimit(), RMNodeLabelsManager.NO_LABEL);
      childQueue.updateClusterResource(clusterResource, childLimits);
    }
    
@@ -774,8 +790,8 @@ public class ParentQueue extends AbstractCSQueue {
    synchronized (this) {
      FiCaSchedulerNode node =
          scheduler.getNode(rmContainer.getContainer().getNodeId());
-      super.allocateResource(clusterResource, rmContainer.getContainer()
-          .getResource(), node.getPartition(), false);
+      allocateResource(clusterResource,
+          rmContainer.getContainer().getResource(), node.getPartition(), false);
    }
    if (parent != null) {
      parent.recoverContainer(clusterResource, attempt, rmContainer);
@@ -802,7 +818,7 @@ public class ParentQueue extends AbstractCSQueue {
    if (application != null) {
      FiCaSchedulerNode node =
          scheduler.getNode(rmContainer.getContainer().getNodeId());
-      super.allocateResource(clusterResource, rmContainer.getContainer()
+      allocateResource(clusterResource, rmContainer.getContainer()
          .getResource(), node.getPartition(), false);
      LOG.info("movedContainer" + " queueMoveIn=" + getQueueName()
          + " usedCapacity=" + getUsedCapacity() + " absoluteUsedCapacity="
@@ -838,4 +854,79 @@ public class ParentQueue extends AbstractCSQueue {
  public synchronized int getNumApplications() {
    return numApplications;
  }
+
+  synchronized void allocateResource(Resource clusterResource,
+      Resource resource, String nodePartition, boolean changeContainerResource) {
+    super.allocateResource(clusterResource, resource, nodePartition,
+        changeContainerResource);
+
+    /**
+     * check if we need to kill (killable) containers if maximum resource violated.
+     * Doing this because we will deduct killable resource when going from root.
+     * For example:
+     * <pre>
+     *      Root
+     *      /   \
+     *     a     b
+     *   /  \
+     *  a1  a2
+     * </pre>
+     *
+     * a: max=10G, used=10G, killable=2G
+     * a1: used=8G, killable=2G
+     * a2: used=2G, pending=2G, killable=0G
+     *
+     * When we get queue-a to allocate resource, even if queue-a
+     * reaches its max resource, we deduct its used by killable, so we can allocate
+     * at most 2G resources. ResourceLimits passed down to a2 has headroom set to 2G.
+     *
+     * If scheduler finds a 2G available resource in existing cluster, and assigns it
+     * to a2, now a2's used= 2G + 2G = 4G, and a's used = 8G + 4G = 12G > 10G
+     *
+     * When this happens, we have to preempt killable container (on same or different
+     * nodes) of parent queue to avoid violating parent's max resource.
+     */
+    if (getQueueCapacities().getAbsoluteMaximumCapacity(nodePartition)
+        < getQueueCapacities().getAbsoluteUsedCapacity(nodePartition)) {
+      killContainersToEnforceMaxQueueCapacity(nodePartition, clusterResource);
+    }
+  }
+
+  private void killContainersToEnforceMaxQueueCapacity(String partition,
+      Resource clusterResource) {
+    Iterator<RMContainer> killableContainerIter = getKillableContainers(
+        partition);
+    if (!killableContainerIter.hasNext()) {
+      return;
+    }
+
+    Resource partitionResource = labelManager.getResourceByLabel(partition,
+        null);
+    Resource maxResource = Resources.multiply(partitionResource,
+        getQueueCapacities().getAbsoluteMaximumCapacity(partition));
+
+    while (Resources.greaterThan(resourceCalculator, partitionResource,
+        queueUsage.getUsed(partition), maxResource)) {
+      RMContainer toKillContainer = killableContainerIter.next();
+      FiCaSchedulerApp attempt = csContext.getApplicationAttempt(
+          toKillContainer.getContainerId().getApplicationAttemptId());
+      FiCaSchedulerNode node = csContext.getNode(
+          toKillContainer.getAllocatedNode());
+      if (null != attempt && null != node) {
+        LeafQueue lq = attempt.getCSLeafQueue();
+        lq.completedContainer(clusterResource, attempt, node, toKillContainer,
+            SchedulerUtils.createPreemptedContainerStatus(
+                toKillContainer.getContainerId(),
+                SchedulerUtils.PREEMPTED_CONTAINER), RMContainerEventType.KILL,
+            null, false);
+        LOG.info("Killed container=" + toKillContainer.getContainerId()
+            + " from queue=" + lq.getQueueName() + " to make queue=" + this
+            .getQueueName() + "'s max-capacity enforced");
+      }
+
+      if (!killableContainerIter.hasNext()) {
+        break;
+      }
+    }
+  }
}
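
The javadoc example above can be made concrete. Under the new getResourceLimitsOfChild, killable resources are added back into the parent's headroom, which is exactly what can push the parent past its max capacity and trigger killContainersToEnforceMaxQueueCapacity. A worked sketch with the same numbers, using plain ints in GB rather than YARN's Resource arithmetic:

public class ParentHeadroomSketch {
  public static void main(String[] args) {
    int aMax = 10, aUsed = 10, aKillable = 2; // queue a
    int a2Used = 2, a2Max = 10;               // child a2

    // parent available = parent-limit - parent-used + parent-killable
    int parentAvailable = aMax - aUsed + aKillable;          // 2G

    // child.limit = min(parent-available + child.used, child.max)
    int a2Limit = Math.min(parentAvailable + a2Used, a2Max); // 4G
    System.out.println("a2 may grow to " + a2Limit + "G");

    // If a2 then grows by 2G out of genuinely free cluster space rather than
    // the killable 2G, a.used becomes 12G > a.max of 10G; the allocateResource
    // override reacts by killing killable containers until used fits under
    // max again (killContainersToEnforceMaxQueueCapacity).
  }
}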

+ 3 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java

@@ -110,6 +110,8 @@ public abstract class AbstractContainerAllocator {
          assignment.setFulfilledReservation(true);
        }
      }
+
+      assignment.setContainersToKill(result.getToKillContainers());
    } else {
      if (result.getAllocationState() == AllocationState.QUEUE_SKIPPED) {
        assignment.setSkippedType(
@@ -135,4 +137,4 @@ public abstract class AbstractContainerAllocator {
  public abstract CSAssignment assignContainers(Resource clusterResource,
      FiCaSchedulerNode node, SchedulingMode schedulingMode,
      ResourceLimits resourceLimits, RMContainer reservedContainer);
-}
+}

+ 12 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java

@@ -19,11 +19,14 @@
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.allocator;

import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
import org.apache.hadoop.yarn.util.resource.Resources;

+import java.util.List;
+
public class ContainerAllocation {
  /**
   * Skip the locality (e.g. node-local, rack-local, any), and look at other
@@ -56,6 +59,7 @@ public class ContainerAllocation {
  NodeType containerNodeType = NodeType.NODE_LOCAL;
  NodeType requestNodeType = NodeType.NODE_LOCAL;
  Container updatedContainer;
+  private List<RMContainer> toKillContainers;

  public ContainerAllocation(RMContainer containerToBeUnreserved,
      Resource resourceToBeAllocated, AllocationState state) {
@@ -86,4 +90,12 @@ public class ContainerAllocation {
  public Container getUpdatedContainer() {
    return updatedContainer;
  }
+
+  public void setToKillContainers(List<RMContainer> toKillContainers) {
+    this.toKillContainers = toKillContainers;
+  }
+
+  public List<RMContainer> getToKillContainers() {
+    return toKillContainers;
+  }
}

+ 30 - 9
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java

@@ -43,6 +43,9 @@ import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
import org.apache.hadoop.yarn.util.resource.Resources;

+import java.util.ArrayList;
+import java.util.List;
+
/**
 * Allocate normal (new) containers, considers locality/label, etc. Using
 * delayed scheduling mechanism to get better locality allocation.
@@ -436,9 +439,6 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
      return ContainerAllocation.LOCALITY_SKIPPED;
    }

-    assert Resources.greaterThan(
-        rc, clusterResource, available, Resources.none());
-
    boolean shouldAllocOrReserveNewContainer = shouldAllocOrReserveNewContainer(
        priority, capability);

@@ -461,6 +461,29 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
    boolean reservationsContinueLooking =
        application.getCSLeafQueue().getReservationContinueLooking();

+    // Check if we need to kill some containers to allocate this one
+    List<RMContainer> toKillContainers = null;
+    if (availableContainers == 0 && currentResoureLimits.isAllowPreemption()) {
+      Resource availableAndKillable = Resources.clone(available);
+      for (RMContainer killableContainer : node
+          .getKillableContainers().values()) {
+        if (null == toKillContainers) {
+          toKillContainers = new ArrayList<>();
+        }
+        toKillContainers.add(killableContainer);
+        Resources.addTo(availableAndKillable,
+                        killableContainer.getAllocatedResource());
+        if (Resources.fitsIn(rc,
+                             clusterResource,
+                             capability,
+                             availableAndKillable)) {
+          // Stop if we find enough spaces
+          availableContainers = 1;
+          break;
+        }
+      }
+    }
+
    if (availableContainers > 0) {
      // Allocate...
      // We will only do continuous reservation when this is not allocated from
@@ -500,12 +523,12 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
          new ContainerAllocation(unreservedContainer, request.getCapability(),
              AllocationState.ALLOCATED);
      result.containerNodeType = type;
+      result.setToKillContainers(toKillContainers);
      return result;
    } else {
      // if we are allowed to allocate but this node doesn't have space, reserve
      // it or if this was an already a reserved container, reserve it again
      if (shouldAllocOrReserveNewContainer || rmContainer != null) {
-
        if (reservationsContinueLooking && rmContainer == null) {
          // we could possibly ignoring queue capacity or user limits when
          // reservationsContinueLooking is set. Make sure we didn't need to
@@ -523,6 +546,7 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
            new ContainerAllocation(null, request.getCapability(),
                AllocationState.RESERVED);
        result.containerNodeType = type;
+        result.setToKillContainers(null);
        return result;
      }
      // Skip the locality request
@@ -614,8 +638,7 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
  }

  ContainerAllocation doAllocation(ContainerAllocation allocationResult,
-      Resource clusterResource, FiCaSchedulerNode node,
-      SchedulingMode schedulingMode, Priority priority,
+      FiCaSchedulerNode node, Priority priority,
      RMContainer reservedContainer) {
    // Create the container if necessary
    Container container =
@@ -686,9 +709,7 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {

    if (AllocationState.ALLOCATED == result.state
        || AllocationState.RESERVED == result.state) {
-      result =
-          doAllocation(result, clusterResource, node, schedulingMode, priority,
-              reservedContainer);
+      result = doAllocation(result, node, priority, reservedContainer);
    }

    return result;
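
The key addition here is the loop that, when the node has no free space but the resource limits allow preemption, walks the node's killable containers and collects just enough of them for the incoming request to fit. A standalone sketch of that selection, with ints standing in for Resource and strings for container ids:

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class KillSelectionSketch {
  // Returns the containers to kill so that `required` fits, or null when the
  // node either already fits the request or cannot fit it even after killing
  // everything killable (in which case nothing should be killed).
  static List<String> selectToKill(int required, int available,
      Map<String, Integer> killableBySize) {
    if (available >= required) {
      return null; // enough free space, no victims needed
    }
    List<String> toKill = new ArrayList<>();
    int availableAndKillable = available;
    for (Map.Entry<String, Integer> e : killableBySize.entrySet()) {
      toKill.add(e.getKey());
      availableAndKillable += e.getValue();
      if (availableAndKillable >= required) {
        return toKill; // stop as soon as the request fits
      }
    }
    return null; // still does not fit; the caller reserves or skips instead
  }

  public static void main(String[] args) {
    Map<String, Integer> killable = new LinkedHashMap<>();
    killable.put("container_1", 1024);
    killable.put("container_2", 2048);
    // needs 2G with 0 free: selects container_1 and container_2
    System.out.println(selectToKill(2048, 0, killable));
  }
}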

+ 45 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/preemption/KillableContainer.java

@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption;
+
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+
+public class KillableContainer {
+  RMContainer container;
+  String partition;
+  String leafQueueName;
+
+  public KillableContainer(RMContainer container, String partition, String leafQueueName) {
+    this.container = container;
+    this.partition = partition;
+    this.leafQueueName = leafQueueName;
+  }
+
+  public RMContainer getRMContainer() {
+    return this.container;
+  }
+
+  public String getNodePartition() {
+    return this.partition;
+  }
+
+  public String getLeafQueueName() {
+    return this.leafQueueName;
+  }
+}

+ 102 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/preemption/PreemptableQueue.java

@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption;
+
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.util.resource.Resources;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentSkipListMap;
+
+public class PreemptableQueue {
+  // Partition -> killable resources and containers
+  private Map<String, Resource> totalKillableResources = new HashMap<>();
+  private Map<String, Map<ContainerId, RMContainer>> killableContainers =
+      new HashMap<>();
+  private PreemptableQueue parent;
+
+  public PreemptableQueue(PreemptableQueue parent) {
+    this.parent = parent;
+  }
+
+  public PreemptableQueue(Map<String, Resource> totalKillableResources,
+      Map<String, Map<ContainerId, RMContainer>> killableContainers) {
+    this.totalKillableResources = totalKillableResources;
+    this.killableContainers = killableContainers;
+  }
+
+  void addKillableContainer(KillableContainer container) {
+    String partition = container.getNodePartition();
+    if (!totalKillableResources.containsKey(partition)) {
+      totalKillableResources.put(partition, Resources.createResource(0));
+      killableContainers.put(partition,
+          new ConcurrentSkipListMap<ContainerId, RMContainer>());
+    }
+
+    RMContainer c = container.getRMContainer();
+    Resources.addTo(totalKillableResources.get(partition),
+        c.getAllocatedResource());
+    killableContainers.get(partition).put(c.getContainerId(), c);
+
+    if (null != parent) {
+      parent.addKillableContainer(container);
+    }
+  }
+
+  void removeKillableContainer(KillableContainer container) {
+    String partition = container.getNodePartition();
+    Map<ContainerId, RMContainer> partitionKillableContainers =
+        killableContainers.get(partition);
+    if (partitionKillableContainers != null) {
+      RMContainer rmContainer = partitionKillableContainers.remove(
+          container.getRMContainer().getContainerId());
+      if (null != rmContainer) {
+        Resources.subtractFrom(totalKillableResources.get(partition),
+            rmContainer.getAllocatedResource());
+      }
+    }
+
+    if (null != parent) {
+      parent.removeKillableContainer(container);
+    }
+  }
+
+  public Resource getKillableResource(String partition) {
+    Resource res = totalKillableResources.get(partition);
+    return res == null ? Resources.none() : res;
+  }
+
+  @SuppressWarnings("unchecked")
+  public Map<ContainerId, RMContainer> getKillableContainers(String partition) {
+    Map<ContainerId, RMContainer> map = killableContainers.get(partition);
+    return map == null ? Collections.EMPTY_MAP : map;
+  }
+
+  public Map<String, Map<ContainerId, RMContainer>> getKillableContainers() {
+    return killableContainers;
+  }
+
+  Map<String, Resource> getTotalKillableResources() {
+    return totalKillableResources;
+  }
+}
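
PreemptableQueue chains to its parent so that every add/remove of a killable container updates per-partition totals all the way to the root. The same idea in a self-contained sketch, with int sizes instead of Resource and illustrative queue names:

import java.util.HashMap;
import java.util.Map;

public class PreemptableNodeSketch {
  private final Map<String, Integer> killablePerPartition = new HashMap<>();
  private final PreemptableNodeSketch parent;

  PreemptableNodeSketch(PreemptableNodeSketch parent) { this.parent = parent; }

  void addKillable(String partition, int size) {
    killablePerPartition.merge(partition, size, Integer::sum);
    if (parent != null) {
      parent.addKillable(partition, size); // totals roll up to the root
    }
  }

  int killable(String partition) {
    return killablePerPartition.getOrDefault(partition, 0);
  }

  public static void main(String[] args) {
    PreemptableNodeSketch root = new PreemptableNodeSketch(null);
    PreemptableNodeSketch a = new PreemptableNodeSketch(root);
    PreemptableNodeSketch a1 = new PreemptableNodeSketch(a);
    a1.addKillable("", 2048); // 2G killable on the default partition
    System.out.println(root.killable("")); // 2048: visible at every ancestor
  }
}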

+ 165 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/preemption/PreemptionManager.java

@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
+import org.apache.hadoop.yarn.util.resource.Resources;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+public class PreemptionManager {
+  private ReentrantReadWriteLock.ReadLock readLock;
+  private ReentrantReadWriteLock.WriteLock writeLock;
+  private Map<String, PreemptableQueue> entities = new HashMap<>();
+
+  public PreemptionManager() {
+    ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
+    readLock = lock.readLock();
+    writeLock = lock.writeLock();
+  }
+
+  public void refreshQueues(CSQueue parent, CSQueue current) {
+    try {
+      writeLock.lock();
+      PreemptableQueue parentEntity = null;
+      if (parent != null) {
+        parentEntity = entities.get(parent.getQueueName());
+      }
+
+      if (!entities.containsKey(current.getQueueName())) {
+        entities.put(current.getQueueName(),
+            new PreemptableQueue(parentEntity));
+      }
+
+      if (current.getChildQueues() != null) {
+        for (CSQueue child : current.getChildQueues()) {
+          refreshQueues(current, child);
+        }
+      }
+    }
+    finally {
+      writeLock.unlock();
+    }
+  }
+
+  public void addKillableContainer(KillableContainer container) {
+    try {
+      writeLock.lock();
+      PreemptableQueue entity = entities.get(container.getLeafQueueName());
+      if (null != entity) {
+        entity.addKillableContainer(container);
+      }
+    }
+    finally {
+      writeLock.unlock();
+    }
+  }
+
+  public void removeKillableContainer(KillableContainer container) {
+    try {
+      writeLock.lock();
+      PreemptableQueue entity = entities.get(container.getLeafQueueName());
+      if (null != entity) {
+        entity.removeKillableContainer(container);
+      }
+    }
+    finally {
+      writeLock.unlock();
+    }
+  }
+
+  public void moveKillableContainer(KillableContainer oldContainer,
+      KillableContainer newContainer) {
+    // TODO, will be called when partition of the node changed OR
+    // container moved to different queue
+  }
+
+  public void updateKillableContainerResource(KillableContainer container,
+      Resource oldResource, Resource newResource) {
+    // TODO, will be called when container's resource changed
+  }
+
+  @VisibleForTesting
+  public Map<ContainerId, RMContainer> getKillableContainersMap(
+      String queueName, String partition) {
+    try {
+      readLock.lock();
+      PreemptableQueue entity = entities.get(queueName);
+      if (entity != null) {
+        Map<ContainerId, RMContainer> containers =
+            entity.getKillableContainers().get(partition);
+        if (containers != null) {
+          return containers;
+        }
+      }
+      return Collections.emptyMap();
+    }
+    finally {
+      readLock.unlock();
+    }
+  }
+
+  public Iterator<RMContainer> getKillableContainers(String queueName,
+      String partition) {
+    return getKillableContainersMap(queueName, partition).values().iterator();
+  }
+
+  public Resource getKillableResource(String queueName, String partition) {
+    try {
+      readLock.lock();
+      PreemptableQueue entity = entities.get(queueName);
+      if (entity != null) {
+        Resource res = entity.getTotalKillableResources().get(partition);
+        if (res == null || res.equals(Resources.none())) {
+          return Resources.none();
+        }
+        return Resources.clone(res);
+      }
+      return Resources.none();
+    }
+    finally {
+      readLock.unlock();
+    }
+  }
+
+  public Map<String, PreemptableQueue> getShallowCopyOfPreemptableEntities() {
+    try {
+      readLock.lock();
+      Map<String, PreemptableQueue> map = new HashMap<>();
+      for (Map.Entry<String, PreemptableQueue> entry : entities.entrySet()) {
+        String key = entry.getKey();
+        PreemptableQueue entity = entry.getValue();
+        map.put(key, new PreemptableQueue(
+            new HashMap<>(entity.getTotalKillableResources()),
+            new HashMap<>(entity.getKillableContainers())));
+      }
+      return map;
+    } finally {
+      readLock.unlock();
+    }
+  }
+}
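
PreemptionManager wraps the per-queue PreemptableQueue entities behind a read/write lock; refreshQueues rebuilds the hierarchy on (re)initialization and the add/remove calls keep it current. A small usage sketch relying only on methods defined in this diff; the queue name is a placeholder:

import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
import org.apache.hadoop.yarn.util.resource.Resources;

public class PreemptionManagerUsage {
  public static void main(String[] args) {
    PreemptionManager pm = new PreemptionManager();
    // With no queues registered yet, lookups fall back to an empty map and
    // Resources.none() rather than null:
    System.out.println(pm.getKillableContainersMap("a1", "").size());          // 0
    System.out.println(pm.getKillableResource("a1", "").equals(Resources.none())); // true
  }
}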

+ 3 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/AssignmentInformation.java

@@ -120,9 +120,9 @@ public class AssignmentInformation {
  }

  private ContainerId getFirstContainerIdFromOperation(Operation op) {
-    if (null != operationDetails.get(Operation.ALLOCATION)) {
+    if (null != operationDetails.get(op)) {
      List<AssignmentDetails> assignDetails =
-          operationDetails.get(Operation.ALLOCATION);
+          operationDetails.get(op);
      if (!assignDetails.isEmpty()) {
        return assignDetails.get(0).containerId;
      }
@@ -131,7 +131,7 @@ public class AssignmentInformation {
   }
   }
 
 
   public ContainerId getFirstAllocatedOrReservedContainerId() {
   public ContainerId getFirstAllocatedOrReservedContainerId() {
-    ContainerId containerId = null;
+    ContainerId containerId;
     containerId = getFirstContainerIdFromOperation(Operation.ALLOCATION);
     containerId = getFirstContainerIdFromOperation(Operation.ALLOCATION);
     if (null != containerId) {
     if (null != containerId) {
       return containerId;
       return containerId;
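
The first hunk above fixes a real bug, not just style: getFirstContainerIdFromOperation took an Operation argument but always looked up Operation.ALLOCATION, so a caller asking for the first reserved container would silently get allocation data instead. After the patch the method honors its parameter; reconstructed in full from the hunk (the trailing fall-through return is outside the visible context and is assumed here):

    private ContainerId getFirstContainerIdFromOperation(Operation op) {
      // Look up the details recorded for the requested operation,
      // not unconditionally for ALLOCATION as before the fix.
      if (null != operationDetails.get(op)) {
        List<AssignmentDetails> assignDetails = operationDetails.get(op);
        if (!assignDetails.isEmpty()) {
          return assignDetails.get(0).containerId;
        }
      }
      return null; // assumed: not shown in the hunk
    }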

+ 19 - 13
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java

@@ -61,6 +61,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAMCont
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAssignment;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityHeadroomProvider;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerContext;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.QueueCapacities;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.SchedulingMode;
@@ -96,6 +97,7 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
    * to hold the message if its app doesn't not get container from a node
    */
   private String appSkipNodeDiagnostics;
+  private CapacitySchedulerContext capacitySchedulerContext;
 
   public FiCaSchedulerApp(ApplicationAttemptId applicationAttemptId, 
       String user, Queue queue, ActiveUsersManager activeUsersManager,
@@ -140,28 +142,30 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
     }
 
     containerAllocator = new ContainerAllocator(this, rc, rmContext);
+
+    if (scheduler instanceof CapacityScheduler) {
+      capacitySchedulerContext = (CapacitySchedulerContext) scheduler;
+    }
   }
 
-  synchronized public boolean containerCompleted(RMContainer rmContainer,
+  public synchronized boolean containerCompleted(RMContainer rmContainer,
       ContainerStatus containerStatus, RMContainerEventType event,
       String partition) {
+    ContainerId containerId = rmContainer.getContainerId();
 
     // Remove from the list of containers
-    if (null == liveContainers.remove(rmContainer.getContainerId())) {
+    if (null == liveContainers.remove(containerId)) {
       return false;
     }
-    
+
     // Remove from the list of newly allocated containers if found
     newlyAllocatedContainers.remove(rmContainer);
 
-    Container container = rmContainer.getContainer();
-    ContainerId containerId = container.getId();
-
     // Inform the container
     rmContainer.handle(
         new RMContainerFinishedEvent(containerId, containerStatus, event));
 
-    containersToPreempt.remove(rmContainer.getContainerId());
+    containersToPreempt.remove(containerId);
 
     RMAuditLogger.logSuccess(getUser(),
         AuditConstants.RELEASE_CONTAINER, "SchedulerApp",
@@ -178,7 +182,7 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
     return true;
   }
 
-  synchronized public RMContainer allocate(NodeType type, FiCaSchedulerNode node,
+  public synchronized RMContainer allocate(NodeType type, FiCaSchedulerNode node,
       Priority priority, ResourceRequest request, 
       Container container) {
 
@@ -202,7 +206,9 @@
 
     // Add it to allContainers list.
     newlyAllocatedContainers.add(rmContainer);
-    liveContainers.put(container.getId(), rmContainer);    
+
+    ContainerId containerId = container.getId();
+    liveContainers.put(containerId, rmContainer);
 
     // Update consumption and track allocations
     List<ResourceRequest> resourceRequestList = appSchedulingInfo.allocate(
@@ -215,17 +221,17 @@
 
     // Inform the container
     rmContainer.handle(
-        new RMContainerEvent(container.getId(), RMContainerEventType.START));
+        new RMContainerEvent(containerId, RMContainerEventType.START));
 
     if (LOG.isDebugEnabled()) {
       LOG.debug("allocate: applicationAttemptId=" 
-          + container.getId().getApplicationAttemptId() 
-          + " container=" + container.getId() + " host="
+          + containerId.getApplicationAttemptId()
+          + " container=" + containerId + " host="
           + container.getNodeId().getHost() + " type=" + type);
     }
     RMAuditLogger.logSuccess(getUser(),
         AuditConstants.ALLOC_CONTAINER, "SchedulerApp",
-        getApplicationId(), container.getId());
+        getApplicationId(), containerId);
     
     return rmContainer;
   }
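
Besides hoisting the ContainerId into a local (one getContainerId()/getId() call instead of several, and in containerCompleted the id is now taken from the RMContainer rather than its possibly-cleared Container), the constructor hunk wires the attempt to its scheduler context. A sketch of that guarded capture, assuming only what the hunk shows (CapacityScheduler implements CapacitySchedulerContext):

    // Capture the context only when running under the CapacityScheduler;
    // under other schedulers the field simply stays null, so any consumer
    // of capacitySchedulerContext must be CapacityScheduler-only code.
    if (scheduler instanceof CapacityScheduler) {
      capacitySchedulerContext = (CapacitySchedulerContext) scheduler;
    }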

+ 61 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerNode.java

@@ -18,22 +18,29 @@
 
 package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica;
 
-
-import java.util.Set;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.Priority;
+import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
+import org.apache.hadoop.yarn.util.resource.Resources;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
 
 public class FiCaSchedulerNode extends SchedulerNode {
 
   private static final Log LOG = LogFactory.getLog(FiCaSchedulerNode.class);
+  private Map<ContainerId, RMContainer> killableContainers = new HashMap<>();
+  private Resource totalKillableResources = Resource.newInstance(0, 0);
   
   public FiCaSchedulerNode(RMNode node, boolean usePortForNodeName,
       Set<String> nodeLabels) {
@@ -92,7 +99,6 @@
   @Override
   public synchronized void unreserveResource(
       SchedulerApplicationAttempt application) {
-
     // adding NP checks as this can now be called for preemption
     if (getReservedContainer() != null
         && getReservedContainer().getContainer() != null
@@ -115,4 +121,55 @@ public class FiCaSchedulerNode extends SchedulerNode {
     }
     setReservedContainer(null);
   }
   }
+
+  // Based on a decision from the preemption policy, mark the container as killable
+  public synchronized void markContainerToKillable(ContainerId containerId) {
+    RMContainer c = launchedContainers.get(containerId);
+    if (c != null && !killableContainers.containsKey(containerId)) {
+      killableContainers.put(containerId, c);
+      Resources.addTo(totalKillableResources, c.getAllocatedResource());
+    }
+  }
+
+  // Based on a decision from the preemption policy, mark the container as
+  // non-killable
+  public synchronized void markContainerToNonKillable(ContainerId containerId) {
+    RMContainer c = launchedContainers.get(containerId);
+    if (c != null && killableContainers.containsKey(containerId)) {
+      killableContainers.remove(containerId);
+      Resources.subtractFrom(totalKillableResources, c.getAllocatedResource());
+    }
+  }
+
+  @Override
+  protected synchronized void updateResource(
+      Container container) {
+    super.updateResource(container);
+    if (killableContainers.containsKey(container.getId())) {
+      Resources.subtractFrom(totalKillableResources, container.getResource());
+      killableContainers.remove(container.getId());
+    }
+  }
+
+  @Override
+  protected synchronized void changeContainerResource(ContainerId containerId,
+      Resource deltaResource, boolean increase) {
+    super.changeContainerResource(containerId, deltaResource, increase);
+
+    if (killableContainers.containsKey(containerId)) {
+      if (increase) {
+        Resources.addTo(totalKillableResources, deltaResource);
+      } else {
+        Resources.subtractFrom(totalKillableResources, deltaResource);
+      }
+    }
+  }
+
+  public synchronized Resource getTotalKillableResources() {
+    return totalKillableResources;
+  }
+
+  public synchronized Map<ContainerId, RMContainer> getKillableContainers() {
+    return killableContainers;
+  }
 }
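
The node now tracks preemption decisions locally: markContainerToKillable and markContainerToNonKillable keep killableContainers and totalKillableResources in step, and both are guarded by containsKey checks, so repeated marks never double-count a container's resources. A small test-style sketch of that invariant (not part of the commit; node is a FiCaSchedulerNode and cid the id of a container launched on it):

    node.markContainerToKillable(cid);
    Resource before = Resources.clone(node.getTotalKillableResources());
    node.markContainerToKillable(cid);    // no-op: already marked killable
    assert before.equals(node.getTotalKillableResources());
    node.markContainerToNonKillable(cid); // unmarks and subtracts its resource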

+ 10 - 5
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/SchedulerEventType.java

@@ -38,10 +38,15 @@ public enum SchedulerEventType {
   // Source: ContainerAllocationExpirer
   CONTAINER_EXPIRED,
 
-  // Source: SchedulingEditPolicy
+  /* Source: SchedulingEditPolicy */
   KILL_RESERVED_CONTAINER,
-  MARK_CONTAINER_FOR_PREEMPTION, // Mark a container for preemption
-                                 // in the near future
-  KILL_PREEMPTED_CONTAINER // Kill a container previously marked for
-                           // preemption
+
+  // Mark a container for preemption
+  MARK_CONTAINER_FOR_PREEMPTION,
+
+  // Mark a for-preemption container killable
+  MARK_CONTAINER_FOR_KILLABLE,
+
+  // Cancel a killable container
+  MARK_CONTAINER_FOR_NONKILLABLE
 }
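
The reworked values map onto the lazy-preemption life cycle: a container is first marked for preemption, later escalated to killable, and can be taken back off the kill list via MARK_CONTAINER_FOR_NONKILLABLE. Raising one of these events looks like the following (mirroring TestRMDispatcher below, whose setup provides appAttemptId, container and rmDispatcher):

    ContainerPreemptEvent event = new ContainerPreemptEvent(appAttemptId,
        container, SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE);
    rmDispatcher.getEventHandler().handle(event);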

+ 2 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMDispatcher.java

@@ -59,7 +59,7 @@ public class TestRMDispatcher {
       rmDispatcher.getEventHandler().handle(event1);
       ContainerPreemptEvent event2 =
           new ContainerPreemptEvent(appAttemptId, container,
-            SchedulerEventType.KILL_PREEMPTED_CONTAINER);
+            SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE);
       rmDispatcher.getEventHandler().handle(event2);
       ContainerPreemptEvent event3 =
           new ContainerPreemptEvent(appAttemptId, container,
@@ -70,7 +70,7 @@ public class TestRMDispatcher {
       verify(sched, times(3)).handle(any(SchedulerEvent.class));
       verify(sched).killReservedContainer(container);
       verify(sched).markContainerForPreemption(appAttemptId, container);
-      verify(sched).killPreemptedContainer(container);
+      verify(sched).markContainerForKillable(container);
     } catch (InterruptedException e) {
       Assert.fail();
     } finally {

+ 3 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java

@@ -398,7 +398,7 @@ public class TestAMRestart {
     ContainerId amContainer =
         ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
     // Preempt the first attempt;
-    scheduler.killPreemptedContainer(scheduler.getRMContainer(amContainer));
+    scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer));
 
     am1.waitForState(RMAppAttemptState.FAILED);
     Assert.assertTrue(! attempt1.shouldCountTowardsMaxAttemptRetry());
@@ -414,7 +414,7 @@ public class TestAMRestart {
     // Preempt the second attempt.
     ContainerId amContainer2 =
         ContainerId.newContainerId(am2.getApplicationAttemptId(), 1);
-    scheduler.killPreemptedContainer(scheduler.getRMContainer(amContainer2));
+    scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer2));
 
     am2.waitForState(RMAppAttemptState.FAILED);
     Assert.assertTrue(! attempt2.shouldCountTowardsMaxAttemptRetry());
@@ -508,7 +508,7 @@ public class TestAMRestart {
         ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
 
     // Forcibly preempt the am container;
-    scheduler.killPreemptedContainer(scheduler.getRMContainer(amContainer));
+    scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer));
 
     am1.waitForState(RMAppAttemptState.FAILED);
     Assert.assertTrue(! attempt1.shouldCountTowardsMaxAttemptRetry());

+ 4 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicy.java

@@ -23,7 +23,7 @@ import static org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.Pro
 import static org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy.OBSERVE_ONLY;
 import static org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy.TOTAL_PREEMPTION_PER_ROUND;
 import static org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy.WAIT_TIME_BEFORE_KILL;
-import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType.KILL_PREEMPTED_CONTAINER;
+import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE;
 import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType.MARK_CONTAINER_FOR_PREEMPTION;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
@@ -75,6 +75,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.Capacity
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.QueueCapacities;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.ContainerPreemptEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
@@ -167,6 +168,7 @@ public class TestProportionalCapacityPreemptionPolicy {
     when(mCS.getConfiguration()).thenReturn(schedConf);
     rmContext = mock(RMContext.class);
     when(mCS.getRMContext()).thenReturn(rmContext);
+    when(mCS.getPreemptionManager()).thenReturn(new PreemptionManager());
     when(rmContext.getNodeLabelManager()).thenReturn(lm);
     mDisp = mock(EventHandler.class);
     Dispatcher disp = mock(Dispatcher.class);
@@ -289,7 +291,7 @@ public class TestProportionalCapacityPreemptionPolicy {
     List<ContainerPreemptEvent> events = evtCaptor.getAllValues();
     for (ContainerPreemptEvent e : events.subList(20, 20)) {
       assertEquals(appC, e.getAppId());
-      assertEquals(KILL_PREEMPTED_CONTAINER, e.getType());
+      assertEquals(MARK_CONTAINER_FOR_KILLABLE, e.getType());
     }
   }
 
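
The same one-line stub recurs in every CapacityScheduler-context mock below: the policy and the queue code now fetch the PreemptionManager from their scheduler context, so the mocks must return a real (empty) instance instead of Mockito's default null:

    when(mCS.getPreemptionManager()).thenReturn(new PreemptionManager());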

+ 2 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyForNodePartitions.java

@@ -67,6 +67,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.Capacity
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.QueueCapacities;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.policy.OrderingPolicy;
@@ -123,6 +124,7 @@ public class TestProportionalCapacityPreemptionPolicyForNodePartitions {
     mClock = mock(Clock.class);
     cs = mock(CapacityScheduler.class);
     when(cs.getResourceCalculator()).thenReturn(rc);
+    when(cs.getPreemptionManager()).thenReturn(new PreemptionManager());
 
     nlm = mock(RMNodeLabelsManager.class);
     mDisp = mock(EventHandler.class);

+ 2 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java

@@ -36,6 +36,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
 import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager;
@@ -265,6 +266,7 @@ public class TestApplicationLimits {
         thenReturn(CapacityScheduler.nonPartitionedQueueComparator);
     when(csContext.getResourceCalculator()).thenReturn(resourceCalculator);
     when(csContext.getRMContext()).thenReturn(rmContext);
+    when(csContext.getPreemptionManager()).thenReturn(new PreemptionManager());
     
     // Say cluster has 100 nodes of 16G each
     Resource clusterResource = 

+ 3 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationPriority.java

@@ -209,7 +209,7 @@ public class TestApplicationPriority {
       if (++counter > 2) {
         break;
       }
-      cs.killPreemptedContainer(schedulerAppAttempt.getRMContainer(c.getId()));
+      cs.markContainerForKillable(schedulerAppAttempt.getRMContainer(c.getId()));
     }
 
     // check node report, 12 GB used and 4 GB available
@@ -517,7 +517,7 @@ public class TestApplicationPriority {
       if (++counter > 2) {
         break;
       }
-      cs.killPreemptedContainer(schedulerAppAttemptApp1.getRMContainer(c.getId()));
+      cs.markContainerForKillable(schedulerAppAttemptApp1.getRMContainer(c.getId()));
       iterator.remove();
     }
 
@@ -547,7 +547,7 @@ public class TestApplicationPriority {
       if (++counter > 1) {
         break;
       }
-      cs.killPreemptedContainer(schedulerAppAttemptApp1.getRMContainer(c.getId()));
+      cs.markContainerForKillable(schedulerAppAttemptApp1.getRMContainer(c.getId()));
       iterator.remove();
     }
 

+ 4 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java

@@ -1255,7 +1255,7 @@
 
     // kill the 3 containers
     for (Container c : allocatedContainers) {
-      cs.killPreemptedContainer(schedulerAppAttempt.getRMContainer(c.getId()));
+      cs.markContainerForKillable(schedulerAppAttempt.getRMContainer(c.getId()));
     }
 
     // check values
@@ -1264,7 +1264,7 @@ public class TestCapacityScheduler {
         Resource.newInstance(CONTAINER_MEMORY * 3, 3), false, 3);
 
     // kill app0-attempt0 AM container
-    cs.killPreemptedContainer(schedulerAppAttempt.getRMContainer(app0
+    cs.markContainerForKillable(schedulerAppAttempt.getRMContainer(app0
         .getCurrentAppAttempt().getMasterContainer().getId()));
 
     // wait for app0 failed
@@ -1287,7 +1287,7 @@ public class TestCapacityScheduler {
     allocatedContainers =
         am1.allocateAndWaitForContainers(3, CONTAINER_MEMORY, nm1);
     for (Container c : allocatedContainers) {
-      cs.killPreemptedContainer(schedulerAppAttempt.getRMContainer(c.getId()));
+      cs.markContainerForKillable(schedulerAppAttempt.getRMContainer(c.getId()));
     }
 
     // check values
@@ -1336,7 +1336,7 @@ public class TestCapacityScheduler {
     }
 
     // Call killContainer to preempt the container
-    cs.killPreemptedContainer(rmContainer);
+    cs.markContainerForKillable(rmContainer);
 
     Assert.assertEquals(3, requests.size());
     for (ResourceRequest request : requests) {

+ 677 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerPreemption.java

@@ -0,0 +1,677 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
+
+import com.google.common.collect.Sets;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.service.Service;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Priority;
+import org.apache.hadoop.yarn.api.records.ResourceRequest;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
+import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
+import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
+import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.RMActiveServices;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingMonitor;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy;
+import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager;
+import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
+import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
+import org.apache.hadoop.yarn.util.Clock;
+import org.apache.hadoop.yarn.util.resource.Resources;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+public class TestCapacitySchedulerPreemption {
+  private static final Log LOG = LogFactory.getLog(
+      TestCapacitySchedulerPreemption.class);
+
+  private final int GB = 1024;
+
+  private Configuration conf;
+
+  RMNodeLabelsManager mgr;
+
+  Clock clock;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = new YarnConfiguration();
+    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
+        ResourceScheduler.class);
+    conf.setBoolean(YarnConfiguration.RM_SCHEDULER_ENABLE_MONITORS, true);
+    conf.setClass(YarnConfiguration.RM_SCHEDULER_MONITOR_POLICIES,
+        ProportionalCapacityPreemptionPolicy.class, SchedulingEditPolicy.class);
+    conf = TestUtils.getConfigurationWithMultipleQueues(this.conf);
+
+    // Set preemption related configurations
+    conf.setInt(ProportionalCapacityPreemptionPolicy.WAIT_TIME_BEFORE_KILL,
+        0);
+    conf.setBoolean(CapacitySchedulerConfiguration.LAZY_PREEMPTION_ENALBED,
+        true);
+    conf.setFloat(
+        ProportionalCapacityPreemptionPolicy.TOTAL_PREEMPTION_PER_ROUND, 1.0f);
+    conf.setFloat(
+        ProportionalCapacityPreemptionPolicy.NATURAL_TERMINATION_FACTOR, 1.0f);
+    mgr = new NullRMNodeLabelsManager();
+    mgr.init(this.conf);
+    clock = mock(Clock.class);
+    when(clock.getTime()).thenReturn(0L);
+  }
+
+  private SchedulingEditPolicy getSchedulingEditPolicy(MockRM rm) {
+    RMActiveServices activeServices = rm.getRMActiveService();
+    SchedulingMonitor mon = null;
+    for (Service service : activeServices.getServices()) {
+      if (service instanceof SchedulingMonitor) {
+        mon = (SchedulingMonitor) service;
+        break;
+      }
+    }
+
+    if (mon != null) {
+      return mon.getSchedulingEditPolicy();
+    }
+    return null;
+  }
+
+  @Test (timeout = 60000)
+  public void testSimplePreemption() throws Exception {
+    /**
+     * Test case: Submit two application (app1/app2) to different queues, queue
+     * structure:
+     *
+     * <pre>
+     *             Root
+     *            /  |  \
+     *           a   b   c
+     *          10   20  70
+     * </pre>
+     *
+     * 1) Two nodes in the cluster, each of them has 4G.
+     *
+     * 2) app1 is submitted to queue-a first; it asks for 7 * 1G containers, so
+     * no more resource is available.
+     *
+     * 3) app2 is submitted to queue-c and asks for one 1G container (for its AM)
+     *
+     * Now the cluster is full.
+     *
+     * 4) app2 asks for another 1G container, system will preempt one container
+     * from app1, and app2 will receive the preempted container
+     */
+    MockRM rm1 = new MockRM(conf);
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+
+    MockNM nm1 = rm1.registerNode("h1:1234", 4 * GB);
+    MockNM nm2 = rm1.registerNode("h2:1234", 4 * GB);
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+    RMNode rmNode2 = rm1.getRMContext().getRMNodes().get(nm2.getNodeId());
+
+    // launch an app to queue, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "a");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    am1.allocate("*", 1 * GB, 7, new ArrayList<ContainerId>());
+
+    // Do allocation 3 times for node1/node2
+    for (int i = 0; i < 3; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+    }
+
+    // App1 should have 7 containers now, and no available resource for cluster
+    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
+        am1.getApplicationAttemptId());
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+
+    // Submit app2 to queue-c and asks for a 1G container for AM
+    RMApp app2 = rm1.submitApp(1 * GB, "app", "user", null, "c");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm2);
+
+    // NM1/NM2 has available resource = 0G
+    Assert.assertEquals(0 * GB, cs.getNode(nm1.getNodeId())
+        .getAvailableResource().getMemory());
+    Assert.assertEquals(0 * GB, cs.getNode(nm2.getNodeId())
+        .getAvailableResource().getMemory());
+
+    // AM asks for a 1 * GB container
+    am2.allocate(Arrays.asList(ResourceRequest
+        .newInstance(Priority.newInstance(1), ResourceRequest.ANY,
+            Resources.createResource(1 * GB), 1)), null);
+
+    // Get edit policy and do one update
+    SchedulingEditPolicy editPolicy = getSchedulingEditPolicy(rm1);
+
+    // Call edit schedule twice, and check if one container from app1 marked
+    // to be "killable"
+    editPolicy.editSchedule();
+    editPolicy.editSchedule();
+
+    PreemptionManager pm = cs.getPreemptionManager();
+    Map<ContainerId, RMContainer> killableContainers =
+        waitKillableContainersSize(pm, "a", RMNodeLabelsManager.NO_LABEL, 1);
+    Assert.assertEquals(1, killableContainers.size());
+    Assert.assertEquals(killableContainers.entrySet().iterator().next().getKey()
+        .getApplicationAttemptId(), am1.getApplicationAttemptId());
+
+    // Call CS.handle once to see if container preempted
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+
+    FiCaSchedulerApp schedulerApp2 = cs.getApplicationAttempt(
+        am2.getApplicationAttemptId());
+
+    // App1 has 6 containers, and app2 has 2 containers
+    Assert.assertEquals(6, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(2, schedulerApp2.getLiveContainers().size());
+
+    rm1.close();
+  }
+
+  @Test (timeout = 60000)
+  public void testPreemptionConsidersNodeLocalityDelay()
+      throws Exception {
+    /**
+     * Test case: same as testSimplePreemption steps 1-3.
+     *
+     * Step 4: app2 asks for a 1G container with locality specified, so it
+     * needs to wait for missed opportunities before getting scheduled.
+     * Check that the system waits out those missed opportunities before the
+     * killable container is actually killed.
+     */
+    MockRM rm1 = new MockRM(conf);
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+    MockNM nm1 = rm1.registerNode("h1:1234", 4 * GB);
+    MockNM nm2 = rm1.registerNode("h2:1234", 4 * GB);
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+    RMNode rmNode2 = rm1.getRMContext().getRMNodes().get(nm2.getNodeId());
+
+    // launch an app to queue, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "a");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    am1.allocate("*", 1 * GB, 6, new ArrayList<ContainerId>());
+
+    // Do allocation 3 times for node1/node2
+    for (int i = 0; i < 3; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+    }
+
+    // App1 should have 7 containers now, and no available resource for cluster
+    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
+        am1.getApplicationAttemptId());
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+
+    // Submit app2 to queue-c and asks for a 1G container for AM
+    RMApp app2 = rm1.submitApp(1 * GB, "app", "user", null, "c");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm2);
+
+    // NM1/NM2 has available resource = 0G
+    Assert.assertEquals(0 * GB, cs.getNode(nm1.getNodeId())
+        .getAvailableResource().getMemory());
+    Assert.assertEquals(0 * GB, cs.getNode(nm2.getNodeId())
+        .getAvailableResource().getMemory());
+
+    // AM asks for a 1 * GB container with unknown host and unknown rack
+    am2.allocate(Arrays.asList(ResourceRequest
+        .newInstance(Priority.newInstance(1), ResourceRequest.ANY,
+            Resources.createResource(1 * GB), 1), ResourceRequest
+        .newInstance(Priority.newInstance(1), "unknownhost",
+            Resources.createResource(1 * GB), 1), ResourceRequest
+        .newInstance(Priority.newInstance(1), "/default-rack",
+            Resources.createResource(1 * GB), 1)), null);
+
+    // Get edit policy and do one update
+    SchedulingEditPolicy editPolicy = getSchedulingEditPolicy(rm1);
+
+    // Call edit schedule twice, and check if one container from app1 marked
+    // to be "killable"
+    editPolicy.editSchedule();
+    editPolicy.editSchedule();
+
+    PreemptionManager pm = cs.getPreemptionManager();
+    Map<ContainerId, RMContainer> killableContainers =
+        waitKillableContainersSize(pm, "a", RMNodeLabelsManager.NO_LABEL, 1);
+    Assert.assertEquals(killableContainers.entrySet().iterator().next().getKey()
+        .getApplicationAttemptId(), am1.getApplicationAttemptId());
+
+    // Call CS.handle once to see if container preempted
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+
+    FiCaSchedulerApp schedulerApp2 = cs.getApplicationAttempt(
+        am2.getApplicationAttemptId());
+
+    // App1 has 7 containers, and app2 has 1 container (no container preempted)
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp2.getLiveContainers().size());
+
+    // Do allocation again, one container will be preempted
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+
+    // App1 has 6 containers, and app2 has 2 containers (new container allocated)
+    Assert.assertEquals(6, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(2, schedulerApp2.getLiveContainers().size());
+
+    rm1.close();
+  }
+
+  @Test (timeout = 60000)
+  public void testPreemptionConsidersHardNodeLocality()
+      throws Exception {
+    /**
+     * Test case: same as testSimplePreemption steps 1-3.
+     *
+     * Step 4: app2 asks for a 1G container with hard locality specified, and
+     *         the requested host does not exist in the cluster.
+     * Confirm the system doesn't preempt any container.
+     */
+    MockRM rm1 = new MockRM(conf);
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+    MockNM nm1 = rm1.registerNode("h1:1234", 4 * GB);
+    MockNM nm2 = rm1.registerNode("h2:1234", 4 * GB);
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+    RMNode rmNode2 = rm1.getRMContext().getRMNodes().get(nm2.getNodeId());
+
+    // launch an app to queue, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "a");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    am1.allocate("*", 1 * GB, 6, new ArrayList<ContainerId>());
+
+    // Do allocation 3 times for node1/node2
+    for (int i = 0; i < 3; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    }
+    for (int i = 0; i < 3; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+    }
+
+    // App1 should have 7 containers now, and no available resource for cluster
+    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
+        am1.getApplicationAttemptId());
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+
+    // Submit app2 to queue-c and asks for a 1G container for AM
+    RMApp app2 = rm1.submitApp(1 * GB, "app", "user", null, "c");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm2);
+
+    // NM1/NM2 has available resource = 0G
+    Assert.assertEquals(0 * GB, cs.getNode(nm1.getNodeId())
+        .getAvailableResource().getMemory());
+    Assert.assertEquals(0 * GB, cs.getNode(nm2.getNodeId())
+        .getAvailableResource().getMemory());
+
+    // AM asks for a 1 * GB container for h3 with hard locality,
+    // h3 doesn't exist in the cluster
+    am2.allocate(Arrays.asList(ResourceRequest
+        .newInstance(Priority.newInstance(1), ResourceRequest.ANY,
+            Resources.createResource(1 * GB), 1, true), ResourceRequest
+        .newInstance(Priority.newInstance(1), "h3",
+            Resources.createResource(1 * GB), 1, false), ResourceRequest
+        .newInstance(Priority.newInstance(1), "/default-rack",
+            Resources.createResource(1 * GB), 1, false)), null);
+
+    // Get edit policy and do one update
+    SchedulingEditPolicy editPolicy = getSchedulingEditPolicy(rm1);
+
+    // Call edit schedule twice, and check if one container from app1 marked
+    // to be "killable"
+    editPolicy.editSchedule();
+    editPolicy.editSchedule();
+
+    PreemptionManager pm = cs.getPreemptionManager();
+    Map<ContainerId, RMContainer> killableContainers =
+        waitKillableContainersSize(pm, "a", RMNodeLabelsManager.NO_LABEL, 1);
+    Assert.assertEquals(killableContainers.entrySet().iterator().next().getKey()
+        .getApplicationAttemptId(), am1.getApplicationAttemptId());
+
+    // Call CS.handle once to see if container preempted
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+
+    FiCaSchedulerApp schedulerApp2 = cs.getApplicationAttempt(
+        am2.getApplicationAttemptId());
+
+    // App1 has 7 containers, and app2 has 1 container (no container preempted)
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp2.getLiveContainers().size());
+
+    // Do allocation again, nothing will be preempted
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+
+    // App1 has 7 containers, and app2 has 1 container (no container allocated)
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp2.getLiveContainers().size());
+
+    rm1.close();
+  }
+
+  @Test (timeout = 60000)
+  public void testPreemptionPolicyShouldRespectAlreadyMarkedKillableContainers()
+      throws Exception {
+    /**
+     * Test case:
+     * <pre>
+     *             Root
+     *            /  |  \
+     *           a   b   c
+     *          10   20  70
+     * </pre>
+     * Submit applications to two queues, one uses more than the other, so
+     * preemption will happen.
+     *
+     * Check:
+     * 1) Killable containers' resources will be excluded from PCPP's
+     *    calculation (no duplicate containers added to the killable list)
+     * 2) When more resources need to be preempted, new containers will be selected
+     *    and killable containers will be considered
+     */
+    MockRM rm1 = new MockRM(conf);
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+    MockNM nm1 = rm1.registerNode("h1:1234", 8 * GB);
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+
+    // launch an app to queue, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "a");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    am1.allocate("*", 1 * GB, 6, new ArrayList<ContainerId>());
+
+    // Do allocation 6 times for node1
+    for (int i = 0; i < 6; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    }
+
+    // App1 should have 7 containers now, and no available resource for cluster
+    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
+        am1.getApplicationAttemptId());
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+
+    // Submit app2 to queue-c and asks for a 1G container for AM
+    RMApp app2 = rm1.submitApp(1 * GB, "app", "user", null, "c");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm1);
+
+    // NM1 has available resource = 0G
+    Assert.assertEquals(0 * GB, cs.getNode(nm1.getNodeId())
+        .getAvailableResource().getMemory());
+    am2.allocate("*", 1 * GB, 1, new ArrayList<ContainerId>());
+
+    // Get edit policy and do one update
+    ProportionalCapacityPreemptionPolicy editPolicy =
+        (ProportionalCapacityPreemptionPolicy) getSchedulingEditPolicy(rm1);
+
+    // Call edit schedule twice, and check if one container from app1 marked
+    // to be "killable"
+    editPolicy.editSchedule();
+    editPolicy.editSchedule();
+
+    PreemptionManager pm = cs.getPreemptionManager();
+    waitKillableContainersSize(pm, "a", RMNodeLabelsManager.NO_LABEL, 1);
+
+    // Check killable containers and to-be-preempted containers in edit policy
+    Assert.assertEquals(0, editPolicy.getToPreemptContainers().size());
+
+    // Run edit schedule again; confirm the status doesn't change
+    editPolicy.editSchedule();
+    Assert.assertEquals(0, editPolicy.getToPreemptContainers().size());
+
+    // Save the current set of killable containers
+    Set<ContainerId> previousKillableContainers = new HashSet<>(
+        pm.getKillableContainersMap("a", RMNodeLabelsManager.NO_LABEL)
+            .keySet());
+
+    // Update request resource of c from 1 to 2, so we need to preempt
+    // one more container
+    am2.allocate("*", 1 * GB, 2, new ArrayList<ContainerId>());
+
+    // Call editPolicy.editSchedule() once; we should have 1 container in the
+    // to-preempt map and 1 container in the killable map
+    editPolicy.editSchedule();
+    Assert.assertEquals(1, editPolicy.getToPreemptContainers().size());
+
+    // Call editPolicy.editSchedule() once more; we should have 2 containers
+    // in the killable map
+    editPolicy.editSchedule();
+    Assert.assertEquals(0, editPolicy.getToPreemptContainers().size());
+
+    // Check if previous killable containers included by new killable containers
+    Map<ContainerId, RMContainer> killableContainers =
+        waitKillableContainersSize(pm, "a", RMNodeLabelsManager.NO_LABEL, 2);
+    Assert.assertTrue(
+        Sets.difference(previousKillableContainers, killableContainers.keySet())
+            .isEmpty());
+  }
+
+  @Test (timeout = 60000)
+  public void testPreemptionPolicyCleanupKillableContainersWhenNoPreemptionNeeded()
+      throws Exception {
+    /**
+     * Test case:
+     * <pre>
+     *             Root
+     *            /  |  \
+     *           a   b   c
+     *          10   20  70
+     * </pre>
+     * Submit applications to two queues, one uses more than the other, so
+     * preemption will happen.
+     *
+     * Check:
+     * 1) Containers will be marked to killable
+     * 2) Cancel resource request
+     * 3) Killable containers will be cancelled from policy and scheduler
+     */
+    MockRM rm1 = new MockRM(conf);
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+    MockNM nm1 = rm1.registerNode("h1:1234", 8 * GB);
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+
+    // launch an app to queue, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "a");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    am1.allocate("*", 1 * GB, 6, new ArrayList<ContainerId>());
+
+    // Do allocation 6 times for node1
+    for (int i = 0; i < 6; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    }
+
+    // App1 should have 7 containers now, and no available resource for cluster
+    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
+        am1.getApplicationAttemptId());
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+
+    // Submit app2 to queue-c and asks for a 1G container for AM
+    RMApp app2 = rm1.submitApp(1 * GB, "app", "user", null, "c");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm1);
+
+    // NM1 has available resource = 0G
+    Assert.assertEquals(0 * GB, cs.getNode(nm1.getNodeId())
+        .getAvailableResource().getMemory());
+    am2.allocate("*", 3 * GB, 1, new ArrayList<ContainerId>());
+
+    // Get edit policy and do one update
+    ProportionalCapacityPreemptionPolicy editPolicy =
+        (ProportionalCapacityPreemptionPolicy) getSchedulingEditPolicy(rm1);
+
+    // Call edit schedule twice, and check if 3 containers from app1 are marked
+    // to be "killable"
+    editPolicy.editSchedule();
+    editPolicy.editSchedule();
+
+    PreemptionManager pm = cs.getPreemptionManager();
+    waitKillableContainersSize(pm, "a", RMNodeLabelsManager.NO_LABEL, 3);
+
+    // Change request from 3G to 2G; now one less container needs to be preempted (3->2)
+    am2.allocate("*", 2 * GB, 1, new ArrayList<ContainerId>());
+    editPolicy.editSchedule();
+    Assert.assertEquals(0, editPolicy.getToPreemptContainers().size());
+    waitKillableContainersSize(pm, "a", RMNodeLabelsManager.NO_LABEL, 2);
+
+    // Call editSchedule once more to make sure still nothing happens
+    editPolicy.editSchedule();
+    Assert.assertEquals(0, editPolicy.getToPreemptContainers().size());
+    waitKillableContainersSize(pm, "a", RMNodeLabelsManager.NO_LABEL, 2);
+  }
+
+  @Test (timeout = 60000)
+  public void testPreemptionConsidersUserLimit()
+      throws Exception {
+    /**
+     * Test case: Submit two application (app1/app2) to different queues, queue
+     * structure:
+     *
+     * <pre>
+     *             Root
+     *            /  |  \
+     *           a   b   c
+     *          10   20  70
+     * </pre>
+     *
+     * Queue-c's user-limit-factor = 0.1, so a single user cannot allocate more than 1 container in queue-c
+     *
+     * 1) Two nodes in the cluster, each of them has 4G.
+     *
+     * 2) app1 is submitted to queue-a first; it asks for 7 * 1G containers, so
+     * no more resource is available.
+     *
+     * 3) app2 is submitted to queue-c and asks for one 1G container (for its AM)
+     *
+     * Now the cluster is full.
+     *
+     * 4) app2 asks for another 1G container; because of queue-c's
+     * user-limit-factor, no container should be preempted from app1.
+     */
+    CapacitySchedulerConfiguration csConf = new CapacitySchedulerConfiguration(conf);
+    csConf.setUserLimitFactor(CapacitySchedulerConfiguration.ROOT + ".c", 0.1f);
+    MockRM rm1 = new MockRM(csConf);
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+    MockNM nm1 = rm1.registerNode("h1:1234", 4 * GB);
+    MockNM nm2 = rm1.registerNode("h2:1234", 4 * GB);
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+    RMNode rmNode2 = rm1.getRMContext().getRMNodes().get(nm2.getNodeId());
+
+    // launch an app to queue, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "a");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    am1.allocate("*", 1 * GB, 6, new ArrayList<ContainerId>());
+
+    // Do allocation 3 times for node1/node2
+    for (int i = 0; i < 3; i++) {
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+      cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+    }
+
+    // App1 should have 7 containers now, and no available resource for cluster
+    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
+        am1.getApplicationAttemptId());
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+
+    // Submit app2 to queue-c and asks for a 1G container for AM
+    RMApp app2 = rm1.submitApp(1 * GB, "app", "user", null, "c");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm2);
+
+    // NM1/NM2 has available resource = 0G
+    Assert.assertEquals(0 * GB, cs.getNode(nm1.getNodeId())
+        .getAvailableResource().getMemory());
+    Assert.assertEquals(0 * GB, cs.getNode(nm2.getNodeId())
+        .getAvailableResource().getMemory());
+
+    // AM asks for a 1 * GB container
+    am2.allocate(Arrays.asList(ResourceRequest
+        .newInstance(Priority.newInstance(1), ResourceRequest.ANY,
+            Resources.createResource(1 * GB), 1)), null);
+
+    // Get edit policy and do one update
+    SchedulingEditPolicy editPolicy = getSchedulingEditPolicy(rm1);
+
+    // Call editSchedule twice and check that no container from app1 is marked
+    // "killable"
+    editPolicy.editSchedule();
+    editPolicy.editSchedule();
+
+    // No preemption happens
+    PreemptionManager pm = cs.getPreemptionManager();
+    Map<ContainerId, RMContainer> killableContainers =
+        waitKillableContainersSize(pm, "a", RMNodeLabelsManager.NO_LABEL, 0);
+    Assert.assertEquals(0, killableContainers.size());
+
+    // Call CS.handle once to verify that no container gets preempted
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
+
+    FiCaSchedulerApp schedulerApp2 = cs.getApplicationAttempt(
+        am2.getApplicationAttemptId());
+
+    // App1 has 7 containers, and app2 has 1 container (nothing was preempted)
+    Assert.assertEquals(7, schedulerApp1.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp2.getLiveContainers().size());
+
+    rm1.close();
+  }
+
+  private Map<ContainerId, RMContainer> waitKillableContainersSize(
+      PreemptionManager pm, String queueName, String partition,
+      int expectedSize) throws InterruptedException {
+    Map<ContainerId, RMContainer> killableContainers =
+        pm.getKillableContainersMap(queueName, partition);
+
+    int wait = 0;
+    // Wait for at most 5 sec (it should normally complete much faster)
+    while (expectedSize != killableContainers.size() && wait < 500) {
+      killableContainers = pm.getKillableContainersMap(queueName, partition);
+      Thread.sleep(10);
+      wait++;
+    }
+
+    Assert.assertEquals(expectedSize, killableContainers.size());
+    return killableContainers;
+  }
+}

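The user-limit arithmetic behind testPreemptionConsidersUserLimit, spelled out: with 2 nodes * 4G = 8G of cluster resource, queue-c's 70% capacity is ~5.6G, and a user-limit-factor of 0.1 caps a single user near 0.56G, which is floored at one minimum allocation (1G), i.e. exactly the AM container app2 already holds. A minimal standalone sketch of that calculation (the class and helper names below are illustrative, not the CapacityScheduler's internal API):

    // Sketch only: approximates why one user in queue-c is capped at a single
    // 1G container, so the preemption policy marks nothing killable.
    public class UserLimitSketch {
      static final int GB = 1024;

      // Assumed simplification of the user-limit formula: capacity share
      // times user-limit-factor, floored at one minimum allocation.
      static int userLimitMB(int clusterMB, float queueCapacity,
          float userLimitFactor, int minAllocMB) {
        int limit = (int) (clusterMB * queueCapacity * userLimitFactor);
        return Math.max(limit, minAllocMB);
      }

      public static void main(String[] args) {
        // 2 nodes * 4G cluster, queue-c capacity = 70%, user-limit-factor = 0.1
        int limitMB = userLimitMB(8 * GB, 0.7f, 0.1f, 1 * GB);
        System.out.println("user limit = " + limitMB + "MB"); // 1024 => 1 container
      }
    }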
+ 2 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestChildQueueOrder.java

@@ -51,6 +51,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEven
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
@@ -99,6 +100,7 @@ public class TestChildQueueOrder {
     when(csContext.getResourceCalculator()).
     thenReturn(resourceComparator);
     when(csContext.getRMContext()).thenReturn(rmContext);
+    when(csContext.getPreemptionManager()).thenReturn(new PreemptionManager());
   }
 
   private FiCaSchedulerApp getMockApplication(int appId, String user) {

+ 3 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java

@@ -51,6 +51,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue.User;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent;
@@ -170,6 +171,7 @@ public class TestLeafQueue {
     when(csContext.getResourceCalculator()).
         thenReturn(resourceCalculator);
     when(csContext.getResourceCalculator()).thenReturn(rC);
+    when(csContext.getPreemptionManager()).thenReturn(new PreemptionManager());
     when(csContext.getRMContext()).thenReturn(rmContext);
     RMContainerTokenSecretManager containerTokenSecretManager =
         new RMContainerTokenSecretManager(conf);
@@ -3284,6 +3286,7 @@ public class TestLeafQueue {
         Resources.createResource(GB, 1));
     when(csContext.getMaximumResourceCapability()).thenReturn(
         Resources.createResource(2 * GB, 2));
+    when(csContext.getPreemptionManager()).thenReturn(new PreemptionManager());
     return csContext;
   }

+ 97 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestNodeLabelContainerAllocation.java

@@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
 import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
@@ -1887,4 +1888,100 @@ public class TestNodeLabelContainerAllocation {
 
     rm1.close();
   }
+
+  @Test
+  public void testParentQueueMaxCapsAreRespected() throws Exception {
+    /*
+     * Queue tree:
+     *          Root
+     *        /     \
+     *       A       B
+     *      / \
+     *     A1 A2
+     *
+     * A has 50% capacity and 50% max capacity (of label=x).
+     * A1/A2 each have 50% capacity and 100% max capacity (of label=x).
+     * The cluster has one node (label=x) with 24G of resources,
+     * so at most 12G can be used under queue A.
+     */
+    CapacitySchedulerConfiguration csConf =
+        new CapacitySchedulerConfiguration(this.conf);
+
+    // Define top-level queues
+    csConf.setQueues(CapacitySchedulerConfiguration.ROOT, new String[] { "a",
+        "b"});
+    csConf.setCapacityByLabel(CapacitySchedulerConfiguration.ROOT, "x", 100);
+
+    final String A = CapacitySchedulerConfiguration.ROOT + ".a";
+    csConf.setCapacity(A, 10);
+    csConf.setAccessibleNodeLabels(A, toSet("x"));
+    csConf.setCapacityByLabel(A, "x", 50);
+    csConf.setMaximumCapacityByLabel(A, "x", 50);
+
+    final String B = CapacitySchedulerConfiguration.ROOT + ".b";
+    csConf.setCapacity(B, 90);
+    csConf.setAccessibleNodeLabels(B, toSet("x"));
+    csConf.setCapacityByLabel(B, "x", 50);
+    csConf.setMaximumCapacityByLabel(B, "x", 50);
+
+    // Define 2nd-level queues
+    csConf.setQueues(A, new String[] { "a1",
+        "a2"});
+
+    final String A1 = A + ".a1";
+    csConf.setCapacity(A1, 50);
+    csConf.setAccessibleNodeLabels(A1, toSet("x"));
+    csConf.setCapacityByLabel(A1, "x", 50);
+    csConf.setMaximumCapacityByLabel(A1, "x", 100);
+    csConf.setUserLimitFactor(A1, 100.0f);
+
+    final String A2 = A + ".a2";
+    csConf.setCapacity(A2, 50);
+    csConf.setAccessibleNodeLabels(A2, toSet("x"));
+    csConf.setCapacityByLabel(A2, "x", 50);
+    csConf.setMaximumCapacityByLabel(A2, "x", 100);
+    csConf.setUserLimitFactor(A2, 100.0f);
+
+    // set node -> label
+    mgr.addToCluserNodeLabels(ImmutableSet.of(
+        NodeLabel.newInstance("x", false)));
+    mgr.addLabelsToNode(ImmutableMap.of(NodeId.newInstance("h1", 0), toSet("x")));
+
+    // inject node label manager
+    MockRM rm = new MockRM(csConf) {
+      @Override
+      public RMNodeLabelsManager createNodeLabelManager() {
+        return mgr;
+      }
+    };
+
+    rm.getRMContext().setNodeLabelManager(mgr);
+    rm.start();
+
+    CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();
+
+    MockNM nm1 =
+        new MockNM("h1:1234", 24 * GB, rm.getResourceTrackerService());
+    nm1.registerNode();
+
+    // Launch app1 in a1; resource usage is 1GB (AM) + 4GB * 2 = 9GB
+    RMApp app1 = rm.submitApp(1 * GB, "app", "user", null, "a1", "x");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm, nm1);
+    am1.allocate("*", 4 * GB, 2, new ArrayList<ContainerId>(), "x");
+    doNMHeartbeat(rm, nm1.getNodeId(), 10);
+    checkNumOfContainersInAnAppOnGivenNode(3, nm1.getNodeId(),
+        cs.getApplicationAttempt(am1.getApplicationAttemptId()));
+
+    // Try to launch app2 in a2 with a 2GB AM; this should succeed
+    RMApp app2 = rm.submitApp(2 * GB, "app", "user", null, "a2", "x");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm, nm1);
+
+    // am2 asks for more resources; the request cannot succeed because current
+    // usage = 9G (app1) + 2G (app2) = 11G, and another 2G container would
+    // exceed queue A's 12G max capacity
+    am2.allocate("*", 2 * GB, 2, new ArrayList<ContainerId>(), "x");
+
+    doNMHeartbeat(rm, nm1.getNodeId(), 10);
+    checkNumOfContainersInAnAppOnGivenNode(1, nm1.getNodeId(),
+        cs.getApplicationAttempt(am2.getApplicationAttemptId()));
+  }
 }

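The headroom math in testParentQueueMaxCapsAreRespected, spelled out: the label=x partition has 24G, queue A's 50% max capacity caps it at 12G, and usage after both AMs is 9G (app1) + 2G (app2) = 11G, so any further 2G allocation must stay pending. A self-contained sketch under those assumptions (plain-int resources and the fits() helper are illustrative, not the scheduler's API):

    // Sketch only: the parent-queue max-capacity check the test exercises.
    public class MaxCapHeadroomSketch {
      static final int GB = 1024;

      // Illustrative check: an allocation fits iff used + request stays
      // within the queue's share of the partition.
      static boolean fits(int usedMB, int requestMB, int partitionMB,
          float maxCapacity) {
        return usedMB + requestMB <= (int) (partitionMB * maxCapacity);
      }

      public static void main(String[] args) {
        int partition = 24 * GB;     // one label=x node
        int used = 9 * GB + 2 * GB;  // app1 (1G AM + 2 * 4G) + app2's 2G AM
        // 11G + 2G = 13G > 12G, so am2's extra containers are not allocated:
        System.out.println(fits(used, 2 * GB, partition, 0.5f)); // false
      }
    }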
+ 2 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestParentQueue.java

@@ -48,6 +48,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
 import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
@@ -93,6 +94,7 @@ public class TestParentQueue {
         thenReturn(Resources.createResource(100 * 16 * GB, 100 * 32));
     when(csContext.getNonPartitionedQueueComparator()).
     thenReturn(CapacityScheduler.nonPartitionedQueueComparator);
+    when(csContext.getPreemptionManager()).thenReturn(new PreemptionManager());
     when(csContext.getResourceCalculator()).
     thenReturn(resourceComparator);
     when(csContext.getRMContext()).thenReturn(rmContext);

+ 2 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestReservations.java

@@ -56,6 +56,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManage
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt.AMState;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptionManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
 import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager;
@@ -127,6 +128,7 @@ public class TestReservations {
     when(csContext.getNonPartitionedQueueComparator()).thenReturn(
         CapacityScheduler.nonPartitionedQueueComparator);
     when(csContext.getResourceCalculator()).thenReturn(resourceCalculator);
+    when(csContext.getPreemptionManager()).thenReturn(new PreemptionManager());
     when(csContext.getRMContext()).thenReturn(rmContext);
     RMContainerTokenSecretManager containerTokenSecretManager = new RMContainerTokenSecretManager(
         conf);

+ 36 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestUtils.java

@@ -376,4 +376,40 @@ public class TestUtils {
     CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();
     return cs.getSchedulerApplications().get(appId).getCurrentAppAttempt();
   }
+
+  /**
+   * Get a queue structure:
+   * <pre>
+   *             Root
+   *            /  |  \
+   *           a   b   c
+   *          10   20  70
+   * </pre>
+   */
+  public static Configuration
+  getConfigurationWithMultipleQueues(Configuration config) {
+    CapacitySchedulerConfiguration conf =
+        new CapacitySchedulerConfiguration(config);
+
+    // Define top-level queues
+    conf.setQueues(CapacitySchedulerConfiguration.ROOT,
+        new String[] { "a", "b", "c" });
+
+    final String A = CapacitySchedulerConfiguration.ROOT + ".a";
+    conf.setCapacity(A, 10);
+    conf.setMaximumCapacity(A, 100);
+    conf.setUserLimitFactor(A, 100);
+
+    final String B = CapacitySchedulerConfiguration.ROOT + ".b";
+    conf.setCapacity(B, 20);
+    conf.setMaximumCapacity(B, 100);
+    conf.setUserLimitFactor(B, 100);
+
+    final String C = CapacitySchedulerConfiguration.ROOT + ".c";
+    conf.setCapacity(C, 70);
+    conf.setMaximumCapacity(C, 100);
+    conf.setUserLimitFactor(C, 100);
+
+    return conf;
+  }
 }

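A likely usage pattern for the new TestUtils#getConfigurationWithMultipleQueues helper, mirroring the MockRM setup seen in the preemption tests above (the scaffolding around the call is assumed, not part of this patch):

    // Hypothetical test snippet; MockRM/MockNM usage follows the tests above.
    Configuration conf = TestUtils.getConfigurationWithMultipleQueues(
        new YarnConfiguration());
    MockRM rm = new MockRM(conf);
    rm.start();
    MockNM nm = rm.registerNode("h1:1234", 8 * GB);
    // Queues a/b/c exist with 10/20/70 capacity, 100% max capacity, and a
    // user-limit-factor of 100, so one app in "a" can grow over its guarantee.
    RMApp app = rm.submitApp(1 * GB, "app", "user", null, "a");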
+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java

@@ -4861,7 +4861,7 @@ public class TestFairScheduler extends FairSchedulerTestBase {
 
     // Trigger container rescheduled event
     scheduler.handle(new ContainerPreemptEvent(appAttemptId, rmContainer,
-      SchedulerEventType.KILL_PREEMPTED_CONTAINER));
+      SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE));
 
     List<ResourceRequest> requests = rmContainer.getResourceRequests();
     // Once recovered, resource request will be present again in app
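The one-line change above is the caller-side effect of this patch's event rename: SchedulerEventType.KILL_PREEMPTED_CONTAINER becomes MARK_CONTAINER_FOR_KILLABLE while the ContainerPreemptEvent payload is unchanged, so call sites only swap the constant:

    // Before: SchedulerEventType.KILL_PREEMPTED_CONTAINER
    // After:  SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE
    scheduler.handle(new ContainerPreemptEvent(appAttemptId, rmContainer,
        SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE));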