|
@@ -71,14 +71,12 @@ public class TaskHeartbeatHandler extends AbstractService {
|
|
|
private Thread lostTaskCheckerThread;
|
|
|
private volatile boolean stopped;
|
|
|
private long taskTimeOut;
|
|
|
- private long unregisterTimeOut;
|
|
|
private int taskTimeOutCheckInterval = 30 * 1000; // 30 seconds.
|
|
|
|
|
|
private final EventHandler eventHandler;
|
|
|
private final Clock clock;
|
|
|
|
|
|
private ConcurrentMap<TaskAttemptId, ReportTime> runningAttempts;
|
|
|
- private ConcurrentMap<TaskAttemptId, ReportTime> recentlyUnregisteredAttempts;
|
|
|
|
|
|
public TaskHeartbeatHandler(EventHandler eventHandler, Clock clock,
|
|
|
int numThreads) {
|
|
@@ -87,8 +85,6 @@ public class TaskHeartbeatHandler extends AbstractService {
|
|
|
this.clock = clock;
|
|
|
runningAttempts =
|
|
|
new ConcurrentHashMap<TaskAttemptId, ReportTime>(16, 0.75f, numThreads);
|
|
|
- recentlyUnregisteredAttempts =
|
|
|
- new ConcurrentHashMap<TaskAttemptId, ReportTime>(16, 0.75f, numThreads);
|
|
|
}
|
|
|
|
|
|
@Override
|
|
@@ -96,8 +92,6 @@ public class TaskHeartbeatHandler extends AbstractService {
|
|
|
super.serviceInit(conf);
|
|
|
taskTimeOut = conf.getLong(
|
|
|
MRJobConfig.TASK_TIMEOUT, MRJobConfig.DEFAULT_TASK_TIMEOUT_MILLIS);
|
|
|
- unregisterTimeOut = conf.getLong(MRJobConfig.TASK_EXIT_TIMEOUT,
|
|
|
- MRJobConfig.TASK_EXIT_TIMEOUT_DEFAULT);
|
|
|
|
|
|
// enforce task timeout is at least twice as long as task report interval
|
|
|
long taskProgressReportIntervalMillis = MRJobConfUtil.
|
|
@@ -146,12 +140,6 @@ public class TaskHeartbeatHandler extends AbstractService {
|
|
|
|
|
|
public void unregister(TaskAttemptId attemptID) {
|
|
|
runningAttempts.remove(attemptID);
|
|
|
- recentlyUnregisteredAttempts.put(attemptID,
|
|
|
- new ReportTime(clock.getTime()));
|
|
|
- }
|
|
|
-
|
|
|
- public boolean hasRecentlyUnregistered(TaskAttemptId attemptID) {
|
|
|
- return recentlyUnregisteredAttempts.containsKey(attemptID);
|
|
|
}
|
|
|
|
|
|
private class PingChecker implements Runnable {
|
|
@@ -159,9 +147,27 @@ public class TaskHeartbeatHandler extends AbstractService {
|
|
|
@Override
|
|
|
public void run() {
|
|
|
while (!stopped && !Thread.currentThread().isInterrupted()) {
|
|
|
+ Iterator<Map.Entry<TaskAttemptId, ReportTime>> iterator =
|
|
|
+ runningAttempts.entrySet().iterator();
|
|
|
+
|
|
|
+ // avoid calculating current time everytime in loop
|
|
|
long currentTime = clock.getTime();
|
|
|
- checkRunning(currentTime);
|
|
|
- checkRecentlyUnregistered(currentTime);
|
|
|
+
|
|
|
+ while (iterator.hasNext()) {
|
|
|
+ Map.Entry<TaskAttemptId, ReportTime> entry = iterator.next();
|
|
|
+ boolean taskTimedOut = (taskTimeOut > 0) &&
|
|
|
+ (currentTime > (entry.getValue().getLastProgress() + taskTimeOut));
|
|
|
+
|
|
|
+ if(taskTimedOut) {
|
|
|
+ // task is lost, remove from the list and raise lost event
|
|
|
+ iterator.remove();
|
|
|
+ eventHandler.handle(new TaskAttemptDiagnosticsUpdateEvent(entry
|
|
|
+ .getKey(), "AttemptID:" + entry.getKey().toString()
|
|
|
+ + " Timed out after " + taskTimeOut / 1000 + " secs"));
|
|
|
+ eventHandler.handle(new TaskAttemptEvent(entry.getKey(),
|
|
|
+ TaskAttemptEventType.TA_TIMED_OUT));
|
|
|
+ }
|
|
|
+ }
|
|
|
try {
|
|
|
Thread.sleep(taskTimeOutCheckInterval);
|
|
|
} catch (InterruptedException e) {
|
|
@@ -170,39 +176,6 @@ public class TaskHeartbeatHandler extends AbstractService {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
- private void checkRunning(long currentTime) {
|
|
|
- Iterator<Map.Entry<TaskAttemptId, ReportTime>> iterator =
|
|
|
- runningAttempts.entrySet().iterator();
|
|
|
-
|
|
|
- while (iterator.hasNext()) {
|
|
|
- Map.Entry<TaskAttemptId, ReportTime> entry = iterator.next();
|
|
|
- boolean taskTimedOut = (taskTimeOut > 0) &&
|
|
|
- (currentTime > (entry.getValue().getLastProgress() + taskTimeOut));
|
|
|
-
|
|
|
- if(taskTimedOut) {
|
|
|
- // task is lost, remove from the list and raise lost event
|
|
|
- iterator.remove();
|
|
|
- eventHandler.handle(new TaskAttemptDiagnosticsUpdateEvent(entry
|
|
|
- .getKey(), "AttemptID:" + entry.getKey().toString()
|
|
|
- + " Timed out after " + taskTimeOut / 1000 + " secs"));
|
|
|
- eventHandler.handle(new TaskAttemptEvent(entry.getKey(),
|
|
|
- TaskAttemptEventType.TA_TIMED_OUT));
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- private void checkRecentlyUnregistered(long currentTime) {
|
|
|
- Iterator<ReportTime> iterator =
|
|
|
- recentlyUnregisteredAttempts.values().iterator();
|
|
|
- while (iterator.hasNext()) {
|
|
|
- ReportTime unregisteredTime = iterator.next();
|
|
|
- if (currentTime >
|
|
|
- unregisteredTime.getLastProgress() + unregisterTimeOut) {
|
|
|
- iterator.remove();
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
}
|
|
|
|
|
|
@VisibleForTesting
|