@@ -158,11 +158,13 @@ public class RMContainerAllocator extends RMContainerRequestor
   private boolean reduceStarted = false;
   private float maxReduceRampupLimit = 0;
   private float maxReducePreemptionLimit = 0;
-  /**
-   * after this threshold, if the container request is not allocated, it is
-   * considered delayed.
-   */
-  private long allocationDelayThresholdMs = 0;
+
+  // Mapper allocation timeout, after which a reducer is forcibly preempted
+  private long reducerUnconditionalPreemptionDelayMs;
+
+  // Duration to wait before preempting a reducer when there is NO room
+  private long reducerNoHeadroomPreemptionDelayMs = 0;
+
   private float reduceSlowStart = 0;
   private int maxRunningMaps = 0;
   private int maxRunningReduces = 0;
@@ -194,7 +196,10 @@ public class RMContainerAllocator extends RMContainerRequestor
     maxReducePreemptionLimit = conf.getFloat(
         MRJobConfig.MR_AM_JOB_REDUCE_PREEMPTION_LIMIT,
         MRJobConfig.DEFAULT_MR_AM_JOB_REDUCE_PREEMPTION_LIMIT);
-    allocationDelayThresholdMs = conf.getInt(
+    reducerUnconditionalPreemptionDelayMs = 1000 * conf.getInt(
+        MRJobConfig.MR_JOB_REDUCER_UNCONDITIONAL_PREEMPT_DELAY_SEC,
+        MRJobConfig.DEFAULT_MR_JOB_REDUCER_UNCONDITIONAL_PREEMPT_DELAY_SEC);
+    reducerNoHeadroomPreemptionDelayMs = conf.getInt(
         MRJobConfig.MR_JOB_REDUCER_PREEMPT_DELAY_SEC,
         MRJobConfig.DEFAULT_MR_JOB_REDUCER_PREEMPT_DELAY_SEC) * 1000;//sec -> ms
     maxRunningMaps = conf.getInt(MRJobConfig.JOB_RUNNING_MAP_LIMIT,
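The two delays read here are per-job knobs: MR_JOB_REDUCER_UNCONDITIONAL_PREEMPT_DELAY_SEC bounds how long a map request may hang before reducers are preempted regardless of headroom (a negative value disables this, per the `>= 0` check further down), while MR_JOB_REDUCER_PREEMPT_DELAY_SEC applies only when the headroom cannot fit a mapper. A minimal sketch of tuning them at job submission, assuming the standard Configuration API and the new key constant introduced elsewhere in this patch:

// Sketch: tuning the two reducer-preemption delays for a job.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MRJobConfig;

public class PreemptionTuning {
  public static Configuration tuned() {
    Configuration conf = new Configuration();
    // Force-preempt reducers once a map request has hung for 5 minutes,
    // regardless of headroom; a negative value disables this behavior.
    conf.setInt(MRJobConfig.MR_JOB_REDUCER_UNCONDITIONAL_PREEMPT_DELAY_SEC, 300);
    // When there is no headroom for a mapper, preempt immediately (0 seconds).
    conf.setInt(MRJobConfig.MR_JOB_REDUCER_PREEMPT_DELAY_SEC, 0);
    return conf;
  }
}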
@@ -454,59 +459,89 @@ public class RMContainerAllocator extends RMContainerRequestor
     if (reduceResourceRequest.equals(Resources.none())) {
       return; // no reduces
     }
-    //check if reduces have taken over the whole cluster and there are
-    //unassigned maps
-    if (scheduledRequests.maps.size() > 0) {
-      Resource resourceLimit = getResourceLimit();
-      Resource availableResourceForMap =
-          Resources.subtract(
-              resourceLimit,
-              Resources.multiply(reduceResourceRequest,
-                  assignedRequests.reduces.size()
-                      - assignedRequests.preemptionWaitingReduces.size()));
-      // availableMemForMap must be sufficient to run at least 1 map
-      if (ResourceCalculatorUtils.computeAvailableContainers(availableResourceForMap,
-          mapResourceRequest, getSchedulerResourceTypes()) <= 0) {
-        // to make sure new containers are given to maps and not reduces
-        // ramp down all scheduled reduces if any
-        // (since reduces are scheduled at higher priority than maps)
-        LOG.info("Ramping down all scheduled reduces:"
-            + scheduledRequests.reduces.size());
-        for (ContainerRequest req : scheduledRequests.reduces.values()) {
-          pendingReduces.add(req);
-        }
-        scheduledRequests.reduces.clear();
-
-        //do further checking to find the number of map requests that were
-        //hanging around for a while
-        int hangingMapRequests = getNumOfHangingRequests(scheduledRequests.maps);
-        if (hangingMapRequests > 0) {
-          // preempt for making space for at least one map
-          int preemptionReduceNumForOneMap =
-              ResourceCalculatorUtils.divideAndCeilContainers(mapResourceRequest,
-                  reduceResourceRequest, getSchedulerResourceTypes());
-          int preemptionReduceNumForPreemptionLimit =
-              ResourceCalculatorUtils.divideAndCeilContainers(
-                  Resources.multiply(resourceLimit, maxReducePreemptionLimit),
-                  reduceResourceRequest, getSchedulerResourceTypes());
-          int preemptionReduceNumForAllMaps =
-              ResourceCalculatorUtils.divideAndCeilContainers(
-                  Resources.multiply(mapResourceRequest, hangingMapRequests),
-                  reduceResourceRequest, getSchedulerResourceTypes());
-          int toPreempt =
-              Math.min(Math.max(preemptionReduceNumForOneMap,
-                  preemptionReduceNumForPreemptionLimit),
-                  preemptionReduceNumForAllMaps);

-          LOG.info("Going to preempt " + toPreempt
-              + " due to lack of space for maps");
-          assignedRequests.preemptReduce(toPreempt);
-        }
+    if (assignedRequests.maps.size() > 0) {
+      // there are assigned mappers
+      return;
+    }
+
+    if (scheduledRequests.maps.size() <= 0) {
+      // there are no pending requests for mappers
+      return;
+    }
+    // At this point:
+    // we have pending mappers and all assigned resources are taken by reducers
+
+    if (reducerUnconditionalPreemptionDelayMs >= 0) {
+      // Unconditional preemption is enabled.
+      // If mappers are pending for longer than the configured threshold,
+      // preempt reducers irrespective of what the headroom is.
+      if (preemptReducersForHangingMapRequests(
+          reducerUnconditionalPreemptionDelayMs)) {
+        return;
       }
     }
+
+    // The pending mappers haven't been waiting for too long. Let us see if
+    // the headroom can fit a mapper.
+    Resource availableResourceForMap = getAvailableResources();
+    if (ResourceCalculatorUtils.computeAvailableContainers(availableResourceForMap,
+        mapResourceRequest, getSchedulerResourceTypes()) > 0) {
+      // the available headroom is enough to run a mapper
+      return;
+    }
+
+    // Available headroom is not enough to run a mapper. See if we should hold
+    // off before preempting reducers and preempt if okay.
+    preemptReducersForHangingMapRequests(reducerNoHeadroomPreemptionDelayMs);
+  }
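// Summary of the rewritten preemptReducesIfNeeded() above (commentary, not
// part of the patch):
//   1. No reduce resource request           -> nothing to preempt for.
//   2. At least one mapper already assigned -> maps are progressing; return.
//   3. No mapper waiting to be scheduled    -> nothing to make room for; return.
//   4. Unconditional delay enabled (>= 0) and a map request has hung longer
//      than that delay                      -> preempt regardless of headroom.
//   5. Headroom still fits one mapper       -> no preemption needed; return.
//   6. Otherwise                            -> preempt once a map request has
//      hung longer than the no-headroom delay.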
+
+  private boolean preemptReducersForHangingMapRequests(long pendingThreshold) {
+    int hangingMapRequests = getNumHangingRequests(
+        pendingThreshold, scheduledRequests.maps);
+    if (hangingMapRequests > 0) {
+      preemptReducer(hangingMapRequests);
+      return true;
+    }
+    return false;
+  }
+
+  private void clearAllPendingReduceRequests() {
+    LOG.info("Ramping down all scheduled reduces:"
+        + scheduledRequests.reduces.size());
+    for (ContainerRequest req : scheduledRequests.reduces.values()) {
+      pendingReduces.add(req);
+    }
+    scheduledRequests.reduces.clear();
   }
-
-  private int getNumOfHangingRequests(Map<TaskAttemptId, ContainerRequest> requestMap) {
+
+  private void preemptReducer(int hangingMapRequests) {
+    clearAllPendingReduceRequests();
+
+    // preempt for making space for at least one map
+    int preemptionReduceNumForOneMap =
+        ResourceCalculatorUtils.divideAndCeilContainers(mapResourceRequest,
+            reduceResourceRequest, getSchedulerResourceTypes());
+    int preemptionReduceNumForPreemptionLimit =
+        ResourceCalculatorUtils.divideAndCeilContainers(
+            Resources.multiply(getResourceLimit(), maxReducePreemptionLimit),
+            reduceResourceRequest, getSchedulerResourceTypes());
+    int preemptionReduceNumForAllMaps =
+        ResourceCalculatorUtils.divideAndCeilContainers(
+            Resources.multiply(mapResourceRequest, hangingMapRequests),
+            reduceResourceRequest, getSchedulerResourceTypes());
+    int toPreempt =
+        Math.min(Math.max(preemptionReduceNumForOneMap,
+            preemptionReduceNumForPreemptionLimit),
+            preemptionReduceNumForAllMaps);
+
+    LOG.info("Going to preempt " + toPreempt
+        + " due to lack of space for maps");
+    assignedRequests.preemptReduce(toPreempt);
+  }
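// Worked example for the clamp above (commentary with hypothetical numbers,
// not part of the patch): suppose maps need 2GB, reducers hold 4GB each,
// getResourceLimit() is 40GB, maxReducePreemptionLimit is 0.5, and 10 map
// requests are hanging:
//   preemptionReduceNumForOneMap          = ceil(2 / 4)        = 1
//   preemptionReduceNumForPreemptionLimit = ceil(40 * 0.5 / 4) = 5
//   preemptionReduceNumForAllMaps         = ceil(2 * 10 / 4)   = 5
//   toPreempt = min(max(1, 5), 5) = 5 reducers
// i.e. preempt enough reducers for the hanging maps, capped by the
// preemption limit, but always at least enough to free one map's worth.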
+
+  private int getNumHangingRequests(long allocationDelayThresholdMs,
+      Map<TaskAttemptId, ContainerRequest> requestMap) {
     if (allocationDelayThresholdMs <= 0)
       return requestMap.size();
     int hangingRequests = 0;
@@ -534,9 +569,6 @@ public class RMContainerAllocator extends RMContainerRequestor

     // get available resources for this job
     Resource headRoom = getAvailableResources();
-    if (headRoom == null) {
-      headRoom = Resources.none();
-    }

     LOG.info("Recalculating schedule, headroom=" + headRoom);

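This hunk and the ones below drop the defensive `headRoom == null` handling. That is only safe if getAvailableResources() now guarantees a non-null Resource; a minimal sketch of the kind of normalization assumed on the RMContainerRequestor side (an illustrative assumption, not shown in this diff):

// Hypothetical requestor-side guarantee these removals rely on: before the
// first allocate response arrives, report an empty Resource instead of null.
protected Resource getAvailableResources() {
  return availableResources == null ? Resources.none() : availableResources;
}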
@@ -663,9 +695,7 @@ public class RMContainerAllocator extends RMContainerRequestor
     applyConcurrentTaskLimits();

     // will be null the first time
-    Resource headRoom =
-        getAvailableResources() == null ? Resources.none() :
-            Resources.clone(getAvailableResources());
+    Resource headRoom = Resources.clone(getAvailableResources());
     AllocateResponse response;
     /*
      * If contact with RM is lost, the AM will wait MR_AM_TO_RM_WAIT_INTERVAL_MS
@@ -706,9 +736,7 @@ public class RMContainerAllocator extends RMContainerRequestor
       // continue to attempt to contact the RM.
       throw e;
     }
-    Resource newHeadRoom =
-        getAvailableResources() == null ? Resources.none()
-            : getAvailableResources();
+    Resource newHeadRoom = getAvailableResources();
     List<Container> newContainers = response.getAllocatedContainers();
     // Setting NMTokens
     if (response.getNMTokens() != null) {
@@ -868,9 +896,6 @@ public class RMContainerAllocator extends RMContainerRequestor
   @Private
   public Resource getResourceLimit() {
     Resource headRoom = getAvailableResources();
-    if (headRoom == null) {
-      headRoom = Resources.none();
-    }
     Resource assignedMapResource =
         Resources.multiply(mapResourceRequest, assignedRequests.maps.size());
     Resource assignedReduceResource =