|
@@ -149,7 +149,6 @@ import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
|
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
|
|
import org.apache.hadoop.yarn.security.client.ClientToAMTokenSecretManager;
|
|
|
import org.apache.hadoop.yarn.util.Clock;
|
|
|
-import org.apache.hadoop.yarn.util.ConverterUtils;
|
|
|
import org.apache.hadoop.yarn.util.SystemClock;
|
|
|
import org.apache.log4j.LogManager;
|
|
|
|
|
@@ -1303,44 +1302,77 @@ public class MRAppMaster extends CompositeService {
|
|
|
}
|
|
|
|
|
|
private void processRecovery() throws IOException{
|
|
|
- if (appAttemptID.getAttemptId() == 1) {
|
|
|
- return; // no need to recover on the first attempt
|
|
|
+ boolean attemptRecovery = shouldAttemptRecovery();
|
|
|
+ boolean recoverySucceeded = true;
|
|
|
+ if (attemptRecovery) {
|
|
|
+ LOG.info("Attempting to recover.");
|
|
|
+ try {
|
|
|
+ parsePreviousJobHistory();
|
|
|
+ } catch (IOException e) {
|
|
|
+ LOG.warn("Unable to parse prior job history, aborting recovery", e);
|
|
|
+ recoverySucceeded = false;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (!isFirstAttempt() && (!attemptRecovery || !recoverySucceeded)) {
|
|
|
+ amInfos.addAll(readJustAMInfos());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private boolean isFirstAttempt() {
|
|
|
+ return appAttemptID.getAttemptId() == 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Check if the current job attempt should try to recover from previous
|
|
|
+ * job attempts if any.
|
|
|
+ */
|
|
|
+ private boolean shouldAttemptRecovery() throws IOException {
|
|
|
+ if (isFirstAttempt()) {
|
|
|
+ return false; // no need to recover on the first attempt
|
|
|
}
|
|
|
|
|
|
boolean recoveryEnabled = getConfig().getBoolean(
|
|
|
MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE,
|
|
|
MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE_DEFAULT);
|
|
|
+ if (!recoveryEnabled) {
|
|
|
+ LOG.info("Not attempting to recover. Recovery disabled. To enable " +
|
|
|
+ "recovery, set " + MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE);
|
|
|
+ return false;
|
|
|
+ }
|
|
|
|
|
|
boolean recoverySupportedByCommitter = isRecoverySupported();
|
|
|
+ if (!recoverySupportedByCommitter) {
|
|
|
+ LOG.info("Not attempting to recover. Recovery is not supported by " +
|
|
|
+ committer.getClass() + ". Use an OutputCommitter that supports" +
|
|
|
+ " recovery.");
|
|
|
+ return false;
|
|
|
+ }
|
|
|
|
|
|
- // If a shuffle secret was not provided by the job client then this app
|
|
|
- // attempt will generate one. However that disables recovery if there
|
|
|
- // are reducers as the shuffle secret would be app attempt specific.
|
|
|
- int numReduceTasks = getConfig().getInt(MRJobConfig.NUM_REDUCES, 0);
|
|
|
+ int reducerCount = getConfig().getInt(MRJobConfig.NUM_REDUCES, 0);
|
|
|
+
|
|
|
+ // If a shuffle secret was not provided by the job client, one will be
|
|
|
+ // generated in this job attempt. However, that disables recovery if
|
|
|
+ // there are reducers as the shuffle secret would be job attempt specific.
|
|
|
boolean shuffleKeyValidForRecovery =
|
|
|
TokenCache.getShuffleSecretKey(jobCredentials) != null;
|
|
|
+ if (reducerCount > 0 && !shuffleKeyValidForRecovery) {
|
|
|
+ LOG.info("Not attempting to recover. The shuffle key is invalid for " +
|
|
|
+ "recovery.");
|
|
|
+ return false;
|
|
|
+ }
|
|
|
|
|
|
- if (recoveryEnabled && recoverySupportedByCommitter
|
|
|
- && (numReduceTasks <= 0 || shuffleKeyValidForRecovery)) {
|
|
|
- LOG.info("Recovery is enabled. "
|
|
|
- + "Will try to recover from previous life on best effort basis.");
|
|
|
- try {
|
|
|
- parsePreviousJobHistory();
|
|
|
- } catch (IOException e) {
|
|
|
- LOG.warn("Unable to parse prior job history, aborting recovery", e);
|
|
|
- // try to get just the AMInfos
|
|
|
- amInfos.addAll(readJustAMInfos());
|
|
|
- }
|
|
|
- } else {
|
|
|
- LOG.info("Will not try to recover. recoveryEnabled: "
|
|
|
- + recoveryEnabled + " recoverySupportedByCommitter: "
|
|
|
- + recoverySupportedByCommitter + " numReduceTasks: "
|
|
|
- + numReduceTasks + " shuffleKeyValidForRecovery: "
|
|
|
- + shuffleKeyValidForRecovery + " ApplicationAttemptID: "
|
|
|
- + appAttemptID.getAttemptId());
|
|
|
- // Get the amInfos anyways whether recovery is enabled or not
|
|
|
- amInfos.addAll(readJustAMInfos());
|
|
|
+ // If the intermediate data is encrypted, recovering the job requires
|
|
|
+ // access to the key. Until the encryption key is persisted, we should
|
|
|
+ // avoid attempts to recover.
|
|
|
+ boolean spillEncrypted = CryptoUtils.isEncryptedSpillEnabled(getConfig());
|
|
|
+ if (reducerCount > 0 && spillEncrypted) {
|
|
|
+ LOG.info("Not attempting to recover. Intermediate spill encryption" +
|
|
|
+ " is enabled.");
|
|
|
+ return false;
|
|
|
}
|
|
|
+
|
|
|
+ return true;
|
|
|
}
|
|
|
|
|
|
private static FSDataInputStream getPreviousJobHistoryStream(
|
|
@@ -1440,6 +1472,10 @@ public class MRAppMaster extends CompositeService {
|
|
|
return amInfos;
|
|
|
}
|
|
|
|
|
|
+ public boolean recovered() {
|
|
|
+ return recoveredJobStartTime > 0;
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* This can be overridden to instantiate multiple jobs and create a
|
|
|
* workflow.
|