|
@@ -19,10 +19,12 @@
|
|
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;
|
|
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;
|
|
|
|
|
|
import java.io.Serializable;
|
|
import java.io.Serializable;
|
|
|
|
+import java.text.DecimalFormat;
|
|
import java.util.Arrays;
|
|
import java.util.Arrays;
|
|
import java.util.Collection;
|
|
import java.util.Collection;
|
|
import java.util.Comparator;
|
|
import java.util.Comparator;
|
|
import java.util.HashMap;
|
|
import java.util.HashMap;
|
|
|
|
+import java.util.HashSet;
|
|
import java.util.List;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
import java.util.Set;
|
|
@@ -52,6 +54,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManage
|
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
|
|
|
|
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
|
|
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
|
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
|
import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
|
|
import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
|
|
import org.apache.hadoop.yarn.util.resource.Resources;
|
|
import org.apache.hadoop.yarn.util.resource.Resources;
|
|
@@ -78,6 +81,10 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
|
|
private RMContainerComparator comparator = new RMContainerComparator();
|
|
private RMContainerComparator comparator = new RMContainerComparator();
|
|
private final Map<RMContainer, Long> preemptionMap = new HashMap<RMContainer, Long>();
|
|
private final Map<RMContainer, Long> preemptionMap = new HashMap<RMContainer, Long>();
|
|
|
|
|
|
|
|
+ // Used to record node reservation by an app.
|
|
|
|
+ // Key = RackName, Value = Set of Nodes reserved by app on rack
|
|
|
|
+ private Map<String, Set<String>> reservations = new HashMap<>();
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* Delay scheduling: We often want to prioritize scheduling of node-local
|
|
* Delay scheduling: We often want to prioritize scheduling of node-local
|
|
* containers over rack-local or off-switch containers. To achieve this
|
|
* containers over rack-local or off-switch containers. To achieve this
|
|
@@ -446,22 +453,53 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
|
|
* in {@link FSSchedulerNode}..
|
|
* in {@link FSSchedulerNode}..
|
|
*/
|
|
*/
|
|
private void reserve(Priority priority, FSSchedulerNode node,
|
|
private void reserve(Priority priority, FSSchedulerNode node,
|
|
- Container container, boolean alreadyReserved) {
|
|
|
|
- LOG.info("Making reservation: node=" + node.getNodeName() +
|
|
|
|
- " app_id=" + getApplicationId());
|
|
|
|
-
|
|
|
|
- if (!alreadyReserved) {
|
|
|
|
- getMetrics().reserveResource(getUser(), container.getResource());
|
|
|
|
- RMContainer rmContainer =
|
|
|
|
- super.reserve(node, priority, null, container);
|
|
|
|
- node.reserveResource(this, priority, rmContainer);
|
|
|
|
- } else {
|
|
|
|
- RMContainer rmContainer = node.getReservedContainer();
|
|
|
|
- super.reserve(node, priority, rmContainer, container);
|
|
|
|
- node.reserveResource(this, priority, rmContainer);
|
|
|
|
|
|
+ Container container, NodeType type, boolean alreadyReserved) {
|
|
|
|
+
|
|
|
|
+ if (!reservationExceedsThreshold(node, type)) {
|
|
|
|
+ LOG.info("Making reservation: node=" + node.getNodeName() +
|
|
|
|
+ " app_id=" + getApplicationId());
|
|
|
|
+ if (!alreadyReserved) {
|
|
|
|
+ getMetrics().reserveResource(getUser(), container.getResource());
|
|
|
|
+ RMContainer rmContainer =
|
|
|
|
+ super.reserve(node, priority, null, container);
|
|
|
|
+ node.reserveResource(this, priority, rmContainer);
|
|
|
|
+ setReservation(node);
|
|
|
|
+ } else {
|
|
|
|
+ RMContainer rmContainer = node.getReservedContainer();
|
|
|
|
+ super.reserve(node, priority, rmContainer, container);
|
|
|
|
+ node.reserveResource(this, priority, rmContainer);
|
|
|
|
+ setReservation(node);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ private boolean reservationExceedsThreshold(FSSchedulerNode node,
|
|
|
|
+ NodeType type) {
|
|
|
|
+ // Only if not node-local
|
|
|
|
+ if (type != NodeType.NODE_LOCAL) {
|
|
|
|
+ int existingReservations = getNumReservations(node.getRackName(),
|
|
|
|
+ type == NodeType.OFF_SWITCH);
|
|
|
|
+ int totalAvailNodes =
|
|
|
|
+ (type == NodeType.OFF_SWITCH) ? scheduler.getNumClusterNodes() :
|
|
|
|
+ scheduler.getNumNodesInRack(node.getRackName());
|
|
|
|
+ int numAllowedReservations =
|
|
|
|
+ (int)Math.ceil(
|
|
|
|
+ totalAvailNodes * scheduler.getReservableNodesRatio());
|
|
|
|
+ if (existingReservations >= numAllowedReservations) {
|
|
|
|
+ DecimalFormat df = new DecimalFormat();
|
|
|
|
+ df.setMaximumFractionDigits(2);
|
|
|
|
+ LOG.info("Reservation Exceeds Allowed number of nodes:" +
|
|
|
|
+ " app_id=" + getApplicationId() +
|
|
|
|
+ " existingReservations=" + existingReservations +
|
|
|
|
+ " totalAvailableNodes=" + totalAvailNodes +
|
|
|
|
+ " reservableNodesRatio=" + df.format(
|
|
|
|
+ scheduler.getReservableNodesRatio()) +
|
|
|
|
+ " numAllowedReservations=" + numAllowedReservations);
|
|
|
|
+ return true;
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ return false;
|
|
|
|
+ }
|
|
/**
|
|
/**
|
|
* Remove the reservation on {@code node} at the given {@link Priority}.
|
|
* Remove the reservation on {@code node} at the given {@link Priority}.
|
|
* This dispatches SchedulerNode handlers as well.
|
|
* This dispatches SchedulerNode handlers as well.
|
|
@@ -470,10 +508,47 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
|
|
RMContainer rmContainer = node.getReservedContainer();
|
|
RMContainer rmContainer = node.getReservedContainer();
|
|
unreserveInternal(priority, node);
|
|
unreserveInternal(priority, node);
|
|
node.unreserveResource(this);
|
|
node.unreserveResource(this);
|
|
|
|
+ clearReservation(node);
|
|
getMetrics().unreserveResource(
|
|
getMetrics().unreserveResource(
|
|
getUser(), rmContainer.getContainer().getResource());
|
|
getUser(), rmContainer.getContainer().getResource());
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ private synchronized void setReservation(SchedulerNode node) {
|
|
|
|
+ String rackName = node.getRackName() == null ? "NULL" : node.getRackName();
|
|
|
|
+ Set<String> rackReservations = reservations.get(rackName);
|
|
|
|
+ if (rackReservations == null) {
|
|
|
|
+ rackReservations = new HashSet<>();
|
|
|
|
+ reservations.put(rackName, rackReservations);
|
|
|
|
+ }
|
|
|
|
+ rackReservations.add(node.getNodeName());
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ private synchronized void clearReservation(SchedulerNode node) {
|
|
|
|
+ String rackName = node.getRackName() == null ? "NULL" : node.getRackName();
|
|
|
|
+ Set<String> rackReservations = reservations.get(rackName);
|
|
|
|
+ if (rackReservations != null) {
|
|
|
|
+ rackReservations.remove(node.getNodeName());
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ int getNumReservations(String rackName, boolean isAny) {
|
|
|
|
+ int counter = 0;
|
|
|
|
+ if (isAny) {
|
|
|
|
+ for (Set<String> nodes : reservations.values()) {
|
|
|
|
+ if (nodes != null) {
|
|
|
|
+ counter += nodes.size();
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ Set<String> nodes = reservations.get(
|
|
|
|
+ rackName == null ? "NULL" : rackName);
|
|
|
|
+ if (nodes != null) {
|
|
|
|
+ counter += nodes.size();
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ return counter;
|
|
|
|
+ }
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* Assign a container to this node to facilitate {@code request}. If node does
|
|
* Assign a container to this node to facilitate {@code request}. If node does
|
|
* not have enough memory, create a reservation. This is called once we are
|
|
* not have enough memory, create a reservation. This is called once we are
|
|
@@ -545,7 +620,7 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
|
|
|
|
|
|
if (isReservable(container)) {
|
|
if (isReservable(container)) {
|
|
// The desired container won't fit here, so reserve
|
|
// The desired container won't fit here, so reserve
|
|
- reserve(request.getPriority(), node, container, reserved);
|
|
|
|
|
|
+ reserve(request.getPriority(), node, container, type, reserved);
|
|
|
|
|
|
return FairScheduler.CONTAINER_RESERVED;
|
|
return FairScheduler.CONTAINER_RESERVED;
|
|
} else {
|
|
} else {
|