|
@@ -18,7 +18,6 @@
|
|
|
package org.apache.hadoop.mapreduce.task.reduce;
|
|
|
|
|
|
import java.io.IOException;
|
|
|
-
|
|
|
import java.net.InetAddress;
|
|
|
import java.net.URI;
|
|
|
import java.net.UnknownHostException;
|
|
@@ -101,6 +100,7 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
|
|
|
|
|
|
private final boolean reportReadErrorImmediately;
|
|
|
private long maxDelay = MRJobConfig.DEFAULT_MAX_SHUFFLE_FETCH_RETRY_DELAY;
|
|
|
+ private int maxHostFailures;
|
|
|
|
|
|
public ShuffleSchedulerImpl(JobConf job, TaskStatus status,
|
|
|
TaskAttemptID reduceId,
|
|
@@ -132,6 +132,9 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
|
|
|
|
|
|
this.maxDelay = job.getLong(MRJobConfig.MAX_SHUFFLE_FETCH_RETRY_DELAY,
|
|
|
MRJobConfig.DEFAULT_MAX_SHUFFLE_FETCH_RETRY_DELAY);
|
|
|
+ this.maxHostFailures = job.getInt(
|
|
|
+ MRJobConfig.MAX_SHUFFLE_FETCH_HOST_FAILURES,
|
|
|
+ MRJobConfig.DEFAULT_MAX_SHUFFLE_FETCH_HOST_FAILURES);
|
|
|
}
|
|
|
|
|
|
@Override
|
|
@@ -213,9 +216,18 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
|
|
|
progress.setStatus("copy(" + mapsDone + " of " + totalMaps + " at "
|
|
|
+ mbpsFormat.format(transferRate) + " MB/s)");
|
|
|
}
|
|
|
+
|
|
|
+ public synchronized void hostFailed(String hostname) {
|
|
|
+ if (hostFailures.containsKey(hostname)) {
|
|
|
+ IntWritable x = hostFailures.get(hostname);
|
|
|
+ x.set(x.get() + 1);
|
|
|
+ } else {
|
|
|
+ hostFailures.put(hostname, new IntWritable(1));
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
public synchronized void copyFailed(TaskAttemptID mapId, MapHost host,
|
|
|
- boolean readError, boolean connectExcpt) {
|
|
|
+ boolean readError, boolean connectExcpt) {
|
|
|
host.penalize();
|
|
|
int failures = 1;
|
|
|
if (failureCounts.containsKey(mapId)) {
|
|
@@ -226,12 +238,9 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
|
|
|
failureCounts.put(mapId, new IntWritable(1));
|
|
|
}
|
|
|
String hostname = host.getHostName();
|
|
|
- if (hostFailures.containsKey(hostname)) {
|
|
|
- IntWritable x = hostFailures.get(hostname);
|
|
|
- x.set(x.get() + 1);
|
|
|
- } else {
|
|
|
- hostFailures.put(hostname, new IntWritable(1));
|
|
|
- }
|
|
|
+ //report failure if already retried maxHostFailures times
|
|
|
+ boolean hostFail = hostFailures.get(hostname).get() > getMaxHostFailures() ? true : false;
|
|
|
+
|
|
|
if (failures >= abortFailureLimit) {
|
|
|
try {
|
|
|
throw new IOException(failures + " failures downloading " + mapId);
|
|
@@ -240,7 +249,7 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- checkAndInformJobTracker(failures, mapId, readError, connectExcpt);
|
|
|
+ checkAndInformJobTracker(failures, mapId, readError, connectExcpt, hostFail);
|
|
|
|
|
|
checkReducerHealth();
|
|
|
|
|
@@ -270,9 +279,9 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
|
|
|
// after every 'maxFetchFailuresBeforeReporting' failures
|
|
|
private void checkAndInformJobTracker(
|
|
|
int failures, TaskAttemptID mapId, boolean readError,
|
|
|
- boolean connectExcpt) {
|
|
|
+ boolean connectExcpt, boolean hostFailed) {
|
|
|
if (connectExcpt || (reportReadErrorImmediately && readError)
|
|
|
- || ((failures % maxFetchFailuresBeforeReporting) == 0)) {
|
|
|
+ || ((failures % maxFetchFailuresBeforeReporting) == 0) || hostFailed) {
|
|
|
LOG.info("Reporting fetch failure for " + mapId + " to jobtracker.");
|
|
|
status.addFetchFailedMap((org.apache.hadoop.mapred.TaskAttemptID) mapId);
|
|
|
}
|
|
@@ -507,4 +516,7 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
|
|
|
referee.join();
|
|
|
}
|
|
|
|
|
|
+ public int getMaxHostFailures() {
|
|
|
+ return maxHostFailures;
|
|
|
+ }
|
|
|
}
|