|
@@ -215,6 +215,8 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
DIAGNOSTIC_UPDATE_TRANSITION = new DiagnosticsUpdateTransition();
|
|
|
private static final InternalErrorTransition
|
|
|
INTERNAL_ERROR_TRANSITION = new InternalErrorTransition();
|
|
|
+ private static final InternalRebootTransition
|
|
|
+ INTERNAL_REBOOT_TRANSITION = new InternalRebootTransition();
|
|
|
private static final TaskAttemptCompletedEventTransition
|
|
|
TASK_ATTEMPT_COMPLETED_EVENT_TRANSITION =
|
|
|
new TaskAttemptCompletedEventTransition();
|
|
@@ -246,6 +248,9 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
.addTransition(JobStateInternal.NEW, JobStateInternal.ERROR,
|
|
|
JobEventType.INTERNAL_ERROR,
|
|
|
INTERNAL_ERROR_TRANSITION)
|
|
|
+ .addTransition(JobStateInternal.NEW, JobStateInternal.REBOOT,
|
|
|
+ JobEventType.JOB_AM_REBOOT,
|
|
|
+ INTERNAL_REBOOT_TRANSITION)
|
|
|
// Ignore-able events
|
|
|
.addTransition(JobStateInternal.NEW, JobStateInternal.NEW,
|
|
|
JobEventType.JOB_UPDATED_NODES)
|
|
@@ -265,6 +270,9 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
.addTransition(JobStateInternal.INITED, JobStateInternal.ERROR,
|
|
|
JobEventType.INTERNAL_ERROR,
|
|
|
INTERNAL_ERROR_TRANSITION)
|
|
|
+ .addTransition(JobStateInternal.INITED, JobStateInternal.REBOOT,
|
|
|
+ JobEventType.JOB_AM_REBOOT,
|
|
|
+ INTERNAL_REBOOT_TRANSITION)
|
|
|
// Ignore-able events
|
|
|
.addTransition(JobStateInternal.INITED, JobStateInternal.INITED,
|
|
|
JobEventType.JOB_UPDATED_NODES)
|
|
@@ -287,6 +295,9 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
.addTransition(JobStateInternal.SETUP, JobStateInternal.ERROR,
|
|
|
JobEventType.INTERNAL_ERROR,
|
|
|
INTERNAL_ERROR_TRANSITION)
|
|
|
+ .addTransition(JobStateInternal.SETUP, JobStateInternal.REBOOT,
|
|
|
+ JobEventType.JOB_AM_REBOOT,
|
|
|
+ INTERNAL_REBOOT_TRANSITION)
|
|
|
// Ignore-able events
|
|
|
.addTransition(JobStateInternal.SETUP, JobStateInternal.SETUP,
|
|
|
JobEventType.JOB_UPDATED_NODES)
|
|
@@ -327,6 +338,9 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
JobStateInternal.RUNNING,
|
|
|
JobStateInternal.ERROR, JobEventType.INTERNAL_ERROR,
|
|
|
INTERNAL_ERROR_TRANSITION)
|
|
|
+ .addTransition(JobStateInternal.RUNNING, JobStateInternal.REBOOT,
|
|
|
+ JobEventType.JOB_AM_REBOOT,
|
|
|
+ INTERNAL_REBOOT_TRANSITION)
|
|
|
|
|
|
// Transitions from KILL_WAIT state.
|
|
|
.addTransition
|
|
@@ -352,7 +366,8 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
EnumSet.of(JobEventType.JOB_KILL,
|
|
|
JobEventType.JOB_UPDATED_NODES,
|
|
|
JobEventType.JOB_MAP_TASK_RESCHEDULED,
|
|
|
- JobEventType.JOB_TASK_ATTEMPT_FETCH_FAILURE))
|
|
|
+ JobEventType.JOB_TASK_ATTEMPT_FETCH_FAILURE,
|
|
|
+ JobEventType.JOB_AM_REBOOT))
|
|
|
|
|
|
// Transitions from COMMITTING state
|
|
|
.addTransition(JobStateInternal.COMMITTING,
|
|
@@ -377,7 +392,10 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
.addTransition(JobStateInternal.COMMITTING,
|
|
|
JobStateInternal.ERROR, JobEventType.INTERNAL_ERROR,
|
|
|
INTERNAL_ERROR_TRANSITION)
|
|
|
- // Ignore-able events
|
|
|
+ .addTransition(JobStateInternal.COMMITTING, JobStateInternal.REBOOT,
|
|
|
+ JobEventType.JOB_AM_REBOOT,
|
|
|
+ INTERNAL_REBOOT_TRANSITION)
|
|
|
+ // Ignore-able events
|
|
|
.addTransition(JobStateInternal.COMMITTING,
|
|
|
JobStateInternal.COMMITTING,
|
|
|
EnumSet.of(JobEventType.JOB_UPDATED_NODES,
|
|
@@ -397,7 +415,8 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
.addTransition(JobStateInternal.SUCCEEDED, JobStateInternal.SUCCEEDED,
|
|
|
EnumSet.of(JobEventType.JOB_KILL,
|
|
|
JobEventType.JOB_UPDATED_NODES,
|
|
|
- JobEventType.JOB_TASK_ATTEMPT_FETCH_FAILURE))
|
|
|
+ JobEventType.JOB_TASK_ATTEMPT_FETCH_FAILURE,
|
|
|
+ JobEventType.JOB_AM_REBOOT))
|
|
|
|
|
|
// Transitions from FAIL_ABORT state
|
|
|
.addTransition(JobStateInternal.FAIL_ABORT,
|
|
@@ -425,7 +444,8 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
JobEventType.JOB_MAP_TASK_RESCHEDULED,
|
|
|
JobEventType.JOB_TASK_ATTEMPT_FETCH_FAILURE,
|
|
|
JobEventType.JOB_COMMIT_COMPLETED,
|
|
|
- JobEventType.JOB_COMMIT_FAILED))
|
|
|
+ JobEventType.JOB_COMMIT_FAILED,
|
|
|
+ JobEventType.JOB_AM_REBOOT))
|
|
|
|
|
|
// Transitions from KILL_ABORT state
|
|
|
.addTransition(JobStateInternal.KILL_ABORT,
|
|
@@ -452,7 +472,8 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
JobEventType.JOB_SETUP_COMPLETED,
|
|
|
JobEventType.JOB_SETUP_FAILED,
|
|
|
JobEventType.JOB_COMMIT_COMPLETED,
|
|
|
- JobEventType.JOB_COMMIT_FAILED))
|
|
|
+ JobEventType.JOB_COMMIT_FAILED,
|
|
|
+ JobEventType.JOB_AM_REBOOT))
|
|
|
|
|
|
// Transitions from FAILED state
|
|
|
.addTransition(JobStateInternal.FAILED, JobStateInternal.FAILED,
|
|
@@ -476,7 +497,8 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
JobEventType.JOB_SETUP_FAILED,
|
|
|
JobEventType.JOB_COMMIT_COMPLETED,
|
|
|
JobEventType.JOB_COMMIT_FAILED,
|
|
|
- JobEventType.JOB_ABORT_COMPLETED))
|
|
|
+ JobEventType.JOB_ABORT_COMPLETED,
|
|
|
+ JobEventType.JOB_AM_REBOOT))
|
|
|
|
|
|
// Transitions from KILLED state
|
|
|
.addTransition(JobStateInternal.KILLED, JobStateInternal.KILLED,
|
|
@@ -498,7 +520,8 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
JobEventType.JOB_SETUP_FAILED,
|
|
|
JobEventType.JOB_COMMIT_COMPLETED,
|
|
|
JobEventType.JOB_COMMIT_FAILED,
|
|
|
- JobEventType.JOB_ABORT_COMPLETED))
|
|
|
+ JobEventType.JOB_ABORT_COMPLETED,
|
|
|
+ JobEventType.JOB_AM_REBOOT))
|
|
|
|
|
|
// No transitions from INTERNAL_ERROR state. Ignore all.
|
|
|
.addTransition(
|
|
@@ -517,9 +540,33 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
JobEventType.JOB_COMMIT_COMPLETED,
|
|
|
JobEventType.JOB_COMMIT_FAILED,
|
|
|
JobEventType.JOB_ABORT_COMPLETED,
|
|
|
- JobEventType.INTERNAL_ERROR))
|
|
|
+ JobEventType.INTERNAL_ERROR,
|
|
|
+ JobEventType.JOB_AM_REBOOT))
|
|
|
.addTransition(JobStateInternal.ERROR, JobStateInternal.ERROR,
|
|
|
JobEventType.JOB_COUNTER_UPDATE, COUNTER_UPDATE_TRANSITION)
|
|
|
+
|
|
|
+ // No transitions from REBOOT state. Ignore all.
|
|
|
+ .addTransition(
|
|
|
+ JobStateInternal.REBOOT,
|
|
|
+ JobStateInternal.REBOOT,
|
|
|
+ EnumSet.of(JobEventType.JOB_INIT,
|
|
|
+ JobEventType.JOB_KILL,
|
|
|
+ JobEventType.JOB_TASK_COMPLETED,
|
|
|
+ JobEventType.JOB_TASK_ATTEMPT_COMPLETED,
|
|
|
+ JobEventType.JOB_MAP_TASK_RESCHEDULED,
|
|
|
+ JobEventType.JOB_DIAGNOSTIC_UPDATE,
|
|
|
+ JobEventType.JOB_UPDATED_NODES,
|
|
|
+ JobEventType.JOB_TASK_ATTEMPT_FETCH_FAILURE,
|
|
|
+ JobEventType.JOB_SETUP_COMPLETED,
|
|
|
+ JobEventType.JOB_SETUP_FAILED,
|
|
|
+ JobEventType.JOB_COMMIT_COMPLETED,
|
|
|
+ JobEventType.JOB_COMMIT_FAILED,
|
|
|
+ JobEventType.JOB_ABORT_COMPLETED,
|
|
|
+ JobEventType.INTERNAL_ERROR,
|
|
|
+ JobEventType.JOB_AM_REBOOT))
|
|
|
+ .addTransition(JobStateInternal.REBOOT, JobStateInternal.REBOOT,
|
|
|
+ JobEventType.JOB_COUNTER_UPDATE, COUNTER_UPDATE_TRANSITION)
|
|
|
+
|
|
|
// create the topology tables
|
|
|
.installTopology();
|
|
|
|
|
@@ -904,6 +951,8 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
return JobState.RUNNING;
|
|
|
case FAIL_ABORT:
|
|
|
return JobState.FAILED;
|
|
|
+ case REBOOT:
|
|
|
+ return JobState.ERROR;
|
|
|
default:
|
|
|
return JobState.valueOf(smState.name());
|
|
|
}
|
|
@@ -972,6 +1021,7 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
case KILLED:
|
|
|
metrics.killedJob(this);
|
|
|
break;
|
|
|
+ case REBOOT:
|
|
|
case ERROR:
|
|
|
case FAILED:
|
|
|
metrics.failedJob(this);
|
|
@@ -1898,8 +1948,17 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- private static class InternalErrorTransition implements
|
|
|
+ private static class InternalTerminationTransition implements
|
|
|
SingleArcTransition<JobImpl, JobEvent> {
|
|
|
+ JobStateInternal terminationState = null;
|
|
|
+ String jobHistoryString = null;
|
|
|
+ public InternalTerminationTransition(JobStateInternal stateInternal,
|
|
|
+ String jobHistoryString) {
|
|
|
+ this.terminationState = stateInternal;
|
|
|
+ // Mostly a hack for the job history server: the state string recorded in job history is passed separately from terminationState.
|
|
|
+ this.jobHistoryString = jobHistoryString;
|
|
|
+ }
|
|
|
+
|
|
|
@Override
|
|
|
public void transition(JobImpl job, JobEvent event) {
|
|
|
//TODO Is this JH event required.
|
|
@@ -1907,9 +1966,21 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
|
|
JobUnsuccessfulCompletionEvent failedEvent =
|
|
|
new JobUnsuccessfulCompletionEvent(job.oldJobId,
|
|
|
job.finishTime, 0, 0,
|
|
|
- JobStateInternal.ERROR.toString());
|
|
|
+ jobHistoryString);
|
|
|
job.eventHandler.handle(new JobHistoryEvent(job.jobId, failedEvent));
|
|
|
- job.finished(JobStateInternal.ERROR);
|
|
|
+ job.finished(terminationState);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private static class InternalErrorTransition extends InternalTerminationTransition { // Termination transition that finishes the job in the ERROR internal state.
|
|
|
+ public InternalErrorTransition(){
|
|
|
+ super(JobStateInternal.ERROR, JobStateInternal.ERROR.toString()); // termination state and job-history state string are both ERROR
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private static class InternalRebootTransition extends InternalTerminationTransition { // Termination transition that finishes the job in the REBOOT internal state.
|
|
|
+ public InternalRebootTransition(){
|
|
|
+ super(JobStateInternal.REBOOT, JobStateInternal.ERROR.toString()); // job finishes in REBOOT, but the job-history event carries the ERROR state string
|
|
|
}
|
|
|
}
|
|
|
|