
MAPREDUCE-7341. Add an intermediate manifest committer for Azure and GCS

This is a mapreduce/spark output committer optimized for
performance and correctness on Azure ADLS Gen 2 storage
(via the abfs connector) and Google Cloud Storage
(via the external gcs connector library).

* It is safe to use with HDFS; however, it has not been optimized
  for that use.
* It is *not* safe for use with S3, and will fail if an attempt
is made to do so.

Contributed by Steve Loughran

Change-Id: I6f3502e79c578b9fd1a8c1485f826784b5421fca
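
As a quick orientation (not part of the change itself), a job can opt in to the new committer through the path output committer factory mechanism. A minimal, hedged sketch follows; the global factory key name is an assumption based on PathOutputCommitterFactory, while ManifestCommitterFactory is the factory class added by this change:

    import org.apache.hadoop.conf.Configuration;

    public class EnableManifestCommitterSketch {
      public static void main(String[] args) {
        // Hedged sketch: point the committer factory at the manifest committer.
        // The key name "mapreduce.outputcommitter.factory.class" is an assumption
        // about PathOutputCommitterFactory's global binding key.
        Configuration conf = new Configuration();
        conf.set("mapreduce.outputcommitter.factory.class",
            "org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory");
        System.out.println(conf.get("mapreduce.outputcommitter.factory.class"));
      }
    }

In practice the binding is more likely to be made per filesystem scheme (abfs, gcs), as sketched after the PathOutputCommitterFactory diff below, so other destinations keep their existing committers.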
Steve Loughran
commit 7328c34ba5
65 changed files with 16579 additions and 6 deletions
  1. 6 1
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
  2. 5 5
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/PathOutputCommitterFactory.java
  3. 761 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitter.java
  4. 375 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConfig.java
  5. 240 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConstants.java
  6. 57 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterFactory.java
  7. 264 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterStatisticNames.java
  8. 137 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/AbstractManifestData.java
  9. 44 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/DiagnosticKeys.java
  10. 202 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/DirEntry.java
  11. 87 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/EntryStatus.java
  12. 189 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/FileEntry.java
  13. 144 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestPrinter.java
  14. 493 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestSuccessData.java
  15. 365 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/TaskManifest.java
  16. 41 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/package-info.java
  17. 94 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/AuditingIntegration.java
  18. 130 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/InternalConstants.java
  19. 374 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestCommitterSupport.java
  20. 291 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperations.java
  21. 187 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java
  22. 40 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/OutputValidationException.java
  23. 30 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/package-info.java
  24. 34 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/package-info.java
  25. 63 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbortTaskStage.java
  26. 942 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbstractJobOrTaskStage.java
  27. 511 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java
  28. 245 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitJobStage.java
  29. 115 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitTaskStage.java
  30. 423 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java
  31. 33 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/JobOrTaskStage.java
  32. 293 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/LoadManifestsStage.java
  33. 173 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/RenameFilesStage.java
  34. 79 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveSuccessFileStage.java
  35. 81 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java
  36. 63 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupJobStage.java
  37. 54 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupTaskStage.java
  38. 556 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java
  39. 37 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageEventCallbacks.java
  40. 214 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/TaskAttemptScanDirectoryStage.java
  41. 200 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/ValidateRenamedFilesStage.java
  42. 30 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/package-info.java
  43. 19 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml
  44. 605 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md
  45. 335 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_architecture.md
  46. 617 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_protocol.md
  47. 1085 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/AbstractManifestCommitterTest.java
  48. 152 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/CommitterTestBinding.java
  49. 420 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterTestSupport.java
  50. 109 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/StubStoreOperations.java
  51. 54 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TaggedFileStatus.java
  52. 142 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCleanupStage.java
  53. 126 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCommitTaskStage.java
  54. 307 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCreateOutputDirectoriesStage.java
  55. 601 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestJobThroughManifestCommitter.java
  56. 141 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestLoadManifestsStage.java
  57. 1801 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestManifestCommitProtocol.java
  58. 379 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestRenameStageFailure.java
  59. 185 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestTaskManifestFileIO.java
  60. 136 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TextOutputForTests.java
  61. 83 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ThreadLeakTracker.java
  62. 380 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/UnreliableManifestStoreOperations.java
  63. 29 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/package-info.java
  64. 138 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/contract/localfs.xml
  65. 33 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/core-site.xml

+ 6 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml

@@ -103,9 +103,14 @@
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
-	<configuration>
+        <configuration>
          <systemPropertyVariables>
            <log4j.configuration>file:///${project.basedir}/src/test/resources/log4j.properties</log4j.configuration>
+
+            <!-- project directory   (target/) on a maven build. -->
+            <project.build.directory>${project.build.directory}</project.build.directory>
+            <!-- this property is used in hadoop-azure with separate paths for parallel runs -->
+            <test.build.dir>${project.build.directory}</test.build.dir>
          </systemPropertyVariables>
        </configuration>
      </plugin>

+ 5 - 5
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/PathOutputCommitterFactory.java

@@ -52,8 +52,8 @@ import org.apache.hadoop.util.ReflectionUtils;
 * attempt context and a possibly null path.
 *
 */
-@InterfaceAudience.Private
-@InterfaceStability.Unstable
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
 public class PathOutputCommitterFactory extends Configured {
   private static final Logger LOG =
       LoggerFactory.getLogger(PathOutputCommitterFactory.class);
@@ -158,7 +158,7 @@ public class PathOutputCommitterFactory extends Configured {
          scheme);
      if (StringUtils.isNotEmpty(conf.getTrimmed(schemeKey))) {
        // it does, so use that key in the classname lookup
-        LOG.debug("Using schema-specific factory for {}", outputPath);
+        LOG.info("Using schema-specific factory for {}", outputPath);
        key = schemeKey;
      } else {
        LOG.debug("No scheme-specific factory defined in {}", schemeKey);
@@ -171,7 +171,7 @@ public class PathOutputCommitterFactory extends Configured {
    String trimmedValue = conf.getTrimmed(key, "");
    if (StringUtils.isEmpty(trimmedValue)) {
      // empty/null value, use default
-      LOG.debug("No output committer factory defined,"
+      LOG.info("No output committer factory defined,"
          + " defaulting to FileOutputCommitterFactory");
      factory = FileOutputCommitterFactory.class;
    } else {
@@ -179,7 +179,7 @@ public class PathOutputCommitterFactory extends Configured {
      factory = conf.getClass(key,
          FileOutputCommitterFactory.class,
          PathOutputCommitterFactory.class);
-      LOG.debug("Using OutputCommitter factory class {} from key {}",
+      LOG.info("Using OutputCommitter factory class {} from key {}",
          factory, key);
    }
    return ReflectionUtils.newInstance(factory, conf);
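
The log-level changes above sit on the scheme-specific lookup path. As a hedged illustration of that binding (the per-scheme key prefix is an assumption inferred from the schemeKey lookup in the code above, not something this diff spells out):

    import org.apache.hadoop.conf.Configuration;

    public class SchemeSpecificFactorySketch {
      public static void main(String[] args) {
        // Hedged sketch: bind a committer factory for abfs destinations only,
        // the case reported by the "schema-specific factory" log line above.
        // The key prefix "mapreduce.outputcommitter.factory.scheme." is an assumption.
        Configuration conf = new Configuration();
        conf.set("mapreduce.outputcommitter.factory.scheme.abfs",
            "org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory");
        System.out.println(conf.get("mapreduce.outputcommitter.factory.scheme.abfs"));
      }
    }

Destinations on other schemes fall through to FileOutputCommitterFactory, matching the default branch shown above.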

+ 761 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitter.java

@@ -0,0 +1,761 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.classification.VisibleForTesting;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobStatus;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperationsThroughFileSystem;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.AbortTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageEventCallbacks;
+import org.apache.hadoop.util.functional.CloseableTaskPoolSubmitter;
+
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.logIOStatisticsAtDebug;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_DIAGNOSTICS_MANIFEST_DIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_SUMMARY_REPORT_DIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASKS_COMPLETED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASKS_FAILED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_COMMIT_FILE_RENAME_RECOVERED;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_ABORT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.STAGE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration.updateCommonContextOnCommitterExit;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration.updateCommonContextOnCommitterEntry;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createIOStatisticsStore;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createJobSummaryFilename;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createManifestOutcome;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestPathForTask;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage.cleanupStageOptionsFromConfig;
+
+/**
+ * This is the Intermediate-Manifest committer.
+ * At every entry point it updates the thread's audit context with
+ * the current stage info; this is a placeholder for
+ * adding audit information to stores other than S3A.
+ *
+ * This is tagged as public/stable. This is mandatory
+ * for the classname and PathOutputCommitter implementation
+ * classes.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
+public class ManifestCommitter extends PathOutputCommitter implements
+    IOStatisticsSource, StageEventCallbacks {
+
+  public static final Logger LOG = LoggerFactory.getLogger(
+      ManifestCommitter.class);
+
+  /**
+   * Role: task committer.
+   */
+  public static final String TASK_COMMITTER = "task committer";
+
+  /**
+   * Role: job committer.
+   */
+  public static final String JOB_COMMITTER = "job committer";
+
+  /**
+   * Committer Configuration as extracted from
+   * the job/task context and set in the constructor.
+   */
+  private final ManifestCommitterConfig baseConfig;
+
+  /**
+   * Destination of the job.
+   */
+  private final Path destinationDir;
+
+  /**
+   * For tasks, the attempt directory.
+   * Null for jobs.
+   */
+  private final Path taskAttemptDir;
+
+  /**
+   * IOStatistics to update.
+   */
+  private final IOStatisticsStore iostatistics;
+
+  /**
+   *  The job Manifest Success data; only valid after a job successfully
+   *  commits.
+   */
+  private ManifestSuccessData successReport;
+
+  /**
+   * The active stage; is updated by a callback from within the stages.
+   */
+  private String activeStage;
+
+  /**
+   * The task manifest of the task commit.
+   * Null unless this is a task attempt and the
+   * task has successfully been committed.
+   */
+  private TaskManifest taskAttemptCommittedManifest;
+
+  /**
+   * Create a committer.
+   * @param outputPath output path
+   * @param context job/task context
+   * @throws IOException failure.
+   */
+  public ManifestCommitter(final Path outputPath,
+      final TaskAttemptContext context) throws IOException {
+    super(outputPath, context);
+    this.destinationDir = resolveDestinationDirectory(outputPath,
+        context.getConfiguration());
+    this.iostatistics = createIOStatisticsStore().build();
+    this.baseConfig = enterCommitter(
+        context.getTaskAttemptID() != null,
+        context);
+
+    this.taskAttemptDir = baseConfig.getTaskAttemptDir();
+    LOG.info("Created ManifestCommitter with JobID {},"
+            + " Task Attempt {} and destination {}",
+        context.getJobID(), context.getTaskAttemptID(), outputPath);
+  }
+
+  /**
+   * Called on entry to a committer method; generates a config for it.
+   * Calls {@code #updateCommonContextOnCommitterEntry()}
+   * to update the audit context.
+   * @param isTask is this a task entry point?
+   * @param context context
+   * @return committer config
+   */
+  private ManifestCommitterConfig enterCommitter(boolean isTask,
+      JobContext context) {
+    ManifestCommitterConfig committerConfig =
+        new ManifestCommitterConfig(
+            getOutputPath(),
+            isTask ? TASK_COMMITTER : JOB_COMMITTER,
+            context,
+            iostatistics,
+            this);
+    updateCommonContextOnCommitterEntry(committerConfig);
+    return committerConfig;
+  }
+
+  /**
+   * Set up a job through a {@link SetupJobStage}.
+   * @param jobContext Context of the job whose output is being written.
+   * @throws IOException IO Failure.
+   */
+  @Override
+  public void setupJob(final JobContext jobContext) throws IOException {
+    ManifestCommitterConfig committerConfig = enterCommitter(false,
+        jobContext);
+    StageConfig stageConfig =
+        committerConfig
+            .createStageConfig()
+            .withOperations(createManifestStoreOperations())
+            .build();
+    // set up the job.
+    new SetupJobStage(stageConfig)
+        .apply(committerConfig.getCreateJobMarker());
+    logCommitterStatisticsAtDebug();
+  }
+
+  /**
+   * Set up a task through a {@link SetupTaskStage}.
+   * Classic FileOutputCommitter is a no-op here, relying
+   * on RecordWriters to create the dir implicitly on file
+   * create().
+   * FileOutputCommitter also uses the existence of that
+   * file as a flag to indicate task commit is needed.
+   * @param context task context.
+   * @throws IOException IO Failure.
+   */
+  @Override
+  public void setupTask(final TaskAttemptContext context)
+      throws IOException {
+    ManifestCommitterConfig committerConfig =
+        enterCommitter(true, context);
+    StageConfig stageConfig =
+        committerConfig
+            .createStageConfig()
+            .withOperations(createManifestStoreOperations())
+            .build();
+    // create task attempt dir; delete if present. Or fail?
+    new SetupTaskStage(stageConfig).apply("");
+    logCommitterStatisticsAtDebug();
+  }
+
+  /**
+   * Always return true.
+   * This way, even if there is no output, stats are collected.
+   * @param context task context.
+   * @return true
+   * @throws IOException IO Failure.
+   */
+  @Override
+  public boolean needsTaskCommit(final TaskAttemptContext context)
+      throws IOException {
+    LOG.info("Probe for needsTaskCommit({})",
+        context.getTaskAttemptID());
+    return true;
+  }
+
+  /**
+   * A failure during job commit cannot be recovered from.
+   *
+   * @param jobContext
+   *          Context of the job whose output is being written.
+   * @return false, always
+   * @throws IOException never
+   */
+  @Override
+  public boolean isCommitJobRepeatable(final JobContext jobContext)
+      throws IOException {
+    LOG.info("Probe for isCommitJobRepeatable({}): returning false",
+        jobContext.getJobID());
+    return false;
+  }
+
+  /**
+   * Declare that task recovery is not supported.
+   * It would be, if someone added the code *and tests*.
+   * @param jobContext
+   *          Context of the job whose output is being written.
+   * @return false, always
+   * @throws IOException never
+   */
+  @Override
+  public boolean isRecoverySupported(final JobContext jobContext)
+      throws IOException {
+    LOG.info("Probe for isRecoverySupported({}): returning false",
+        jobContext.getJobID());
+    return false;
+  }
+
+  /**
+   * Task recovery is not supported; this always throws an IOException.
+   * @param taskContext Context of the task whose output is being recovered
+   * @throws IOException always
+   */
+  @Override
+  public void recoverTask(final TaskAttemptContext taskContext)
+      throws IOException {
+    LOG.warn("Rejecting recoverTask({}) call", taskContext.getTaskAttemptID());
+    throw new IOException("Cannot recover task "
+        + taskContext.getTaskAttemptID());
+  }
+
+  /**
+   * Commit the task.
+   * This is where the listing of the task attempt directory tree takes place.
+   * @param context task context.
+   * @throws IOException IO Failure.
+   */
+  @Override
+  public void commitTask(final TaskAttemptContext context)
+      throws IOException {
+    ManifestCommitterConfig committerConfig = enterCommitter(true,
+        context);
+    try {
+      StageConfig stageConfig = committerConfig.createStageConfig()
+          .withOperations(createManifestStoreOperations())
+          .build();
+      taskAttemptCommittedManifest = new CommitTaskStage(stageConfig)
+          .apply(null).getTaskManifest();
+      iostatistics.incrementCounter(COMMITTER_TASKS_COMPLETED_COUNT, 1);
+    } catch (IOException e) {
+      iostatistics.incrementCounter(COMMITTER_TASKS_FAILED_COUNT, 1);
+      throw e;
+    } finally {
+      logCommitterStatisticsAtDebug();
+      updateCommonContextOnCommitterExit();
+    }
+
+  }
+
+  /**
+   * Abort a task.
+   * @param context task context
+   * @throws IOException failure during the delete
+   */
+  @Override
+  public void abortTask(final TaskAttemptContext context)
+      throws IOException {
+    ManifestCommitterConfig committerConfig = enterCommitter(true,
+        context);
+    try {
+      new AbortTaskStage(
+          committerConfig.createStageConfig()
+              .withOperations(createManifestStoreOperations())
+              .build())
+          .apply(false);
+    } finally {
+      logCommitterStatisticsAtDebug();
+      updateCommonContextOnCommitterExit();
+    }
+  }
+
+  /**
+   * Get the manifest success data for this job; creating on demand if needed.
+   * @param committerConfig source config.
+   * @return the current {@link #successReport} value; never null.
+   */
+  private ManifestSuccessData getOrCreateSuccessData(
+      ManifestCommitterConfig committerConfig) {
+    if (successReport == null) {
+      successReport = createManifestOutcome(
+          committerConfig.createStageConfig(), activeStage);
+    }
+    return successReport;
+  }
+
+  /**
+   * This is the big job commit stage.
+   * Load the manifests, prepare the destination, rename
+   * the files then cleanup the job directory.
+   * @param jobContext Context of the job whose output is being written.
+   * @throws IOException failure.
+   */
+  @Override
+  public void commitJob(final JobContext jobContext) throws IOException {
+
+    ManifestCommitterConfig committerConfig = enterCommitter(false, jobContext);
+
+    // create the initial success data.
+    // this is overwritten by that created during the operation sequence,
+    // but if the sequence fails before that happens, it
+    // will be saved to the report directory.
+    ManifestSuccessData marker = getOrCreateSuccessData(committerConfig);
+    IOException failure = null;
+    try (CloseableTaskPoolSubmitter ioProcs =
+             committerConfig.createSubmitter();
+         ManifestStoreOperations storeOperations = createManifestStoreOperations()) {
+      // the stage config will be shared across all stages.
+      StageConfig stageConfig = committerConfig.createStageConfig()
+          .withOperations(storeOperations)
+          .withIOProcessors(ioProcs)
+          .build();
+
+      // commit the job, including any cleanup and validation.
+      final Configuration conf = jobContext.getConfiguration();
+      CommitJobStage.Result result = new CommitJobStage(stageConfig).apply(
+          new CommitJobStage.Arguments(
+              committerConfig.getCreateJobMarker(),
+              committerConfig.getValidateOutput(),
+              conf.getTrimmed(OPT_DIAGNOSTICS_MANIFEST_DIR, ""),
+              cleanupStageOptionsFromConfig(
+                  OP_STAGE_JOB_CLEANUP, conf)
+          ));
+      marker = result.getJobSuccessData();
+      // update the cached success with the new report.
+      setSuccessReport(marker);
+
+    } catch (IOException e) {
+      // failure. record it for the summary
+      failure = e;
+      // rethrow
+      throw e;
+    } finally {
+      // save the report summary, even on failure
+      maybeSaveSummary(activeStage,
+          committerConfig,
+          marker,
+          failure,
+          true,
+          true);
+      // print job commit stats
+      LOG.info("{}: Job Commit statistics {}",
+          committerConfig.getName(),
+          ioStatisticsToPrettyString(iostatistics));
+      // and warn of rename problems
+      final Long recoveries = iostatistics.counters().get(OP_COMMIT_FILE_RENAME_RECOVERED);
+      if (recoveries != null && recoveries > 0) {
+        LOG.warn("{}: rename failures were recovered from. Number of recoveries: {}",
+            committerConfig.getName(), recoveries);
+      }
+      updateCommonContextOnCommitterExit();
+    }
+  }
+
+  /**
+   * Abort the job.
+   * Invokes
+   * {@link #executeCleanup(String, JobContext, ManifestCommitterConfig)}
+   * then saves the (ongoing) job report data if reporting is enabled.
+   * @param jobContext Context of the job whose output is being written.
+   * @param state final runstate of the job
+   * @throws IOException failure during cleanup; report failures are swallowed
+   */
+  @Override
+  public void abortJob(final JobContext jobContext,
+      final JobStatus.State state)
+      throws IOException {
+    LOG.info("Aborting Job {} in state {}", jobContext.getJobID(), state);
+    ManifestCommitterConfig committerConfig = enterCommitter(false,
+        jobContext);
+    ManifestSuccessData report = getOrCreateSuccessData(
+        committerConfig);
+    IOException failure = null;
+
+    try {
+      executeCleanup(OP_STAGE_JOB_ABORT, jobContext, committerConfig);
+    } catch (IOException e) {
+      // failure.
+      failure = e;
+    }
+    report.setSuccess(false);
+    // job abort does not overwrite any existing report, so a job commit
+    // failure cause will be preserved.
+    maybeSaveSummary(activeStage, committerConfig, report, failure,
+        true, false);
+    // print job stats
+    LOG.info("Job Abort statistics {}",
+        ioStatisticsToPrettyString(iostatistics));
+    updateCommonContextOnCommitterExit();
+  }
+
+  /**
+   * Execute the {@code CleanupJobStage} to remove the job attempt dir.
+   * @param jobContext Context of the job whose output is being written.
+   * @throws IOException failure during cleanup
+   */
+  @SuppressWarnings("deprecation")
+  @Override
+  public void cleanupJob(final JobContext jobContext) throws IOException {
+    ManifestCommitterConfig committerConfig = enterCommitter(false,
+        jobContext);
+    try {
+      executeCleanup(OP_STAGE_JOB_CLEANUP, jobContext, committerConfig);
+    } finally {
+      logCommitterStatisticsAtDebug();
+      updateCommonContextOnCommitterExit();
+    }
+  }
+
+  /**
+   * Perform the cleanup operation for job cleanup or abort.
+   * @param statisticName statistic/stage name
+   * @param jobContext job context
+   * @param committerConfig committer config
+   * @throws IOException failure
+   * @return the outcome
+   */
+  private CleanupJobStage.Result executeCleanup(
+      final String statisticName,
+      final JobContext jobContext,
+      final ManifestCommitterConfig committerConfig) throws IOException {
+    try (CloseableTaskPoolSubmitter ioProcs =
+             committerConfig.createSubmitter()) {
+
+      return new CleanupJobStage(
+          committerConfig.createStageConfig()
+              .withOperations(createManifestStoreOperations())
+              .withIOProcessors(ioProcs)
+              .build())
+          .apply(cleanupStageOptionsFromConfig(
+              statisticName,
+              jobContext.getConfiguration()));
+    }
+  }
+
+  /**
+   * Output path: destination directory of the job.
+   * @return the overall job destination directory.
+   */
+  @Override
+  public Path getOutputPath() {
+    return getDestinationDir();
+  }
+
+  /**
+   * Work path of the current task attempt.
+   * This is null if the task does not have one.
+   * @return a path.
+   */
+  @Override
+  public Path getWorkPath() {
+    return getTaskAttemptDir();
+  }
+
+  /**
+   * Get the job destination dir.
+   * @return dest dir.
+   */
+  private Path getDestinationDir() {
+    return destinationDir;
+  }
+
+  /**
+   * Get the task attempt dir.
+   * May be null.
+   * @return a path or null.
+   */
+  private Path getTaskAttemptDir() {
+    return taskAttemptDir;
+  }
+
+  /**
+   * Callback on stage entry.
+   * Sets {@link #activeStage} and updates the
+   * common context.
+   * @param stage new stage
+   */
+  @Override
+  public void enterStage(String stage) {
+    activeStage = stage;
+    AuditingIntegration.enterStage(stage);
+  }
+
+  /**
+   * Remove stage from common audit context.
+   * @param stage stage exited.
+   */
+  @Override
+  public void exitStage(String stage) {
+    AuditingIntegration.exitStage();
+  }
+
+  /**
+   * Get the unique ID of this job.
+   * @return job ID (yarn, spark)
+   */
+  public String getJobUniqueId() {
+    return baseConfig.getJobUniqueId();
+  }
+
+  /**
+   * Get the config of the task attempt this instance was constructed
+   * with.
+   * @return a configuration.
+   */
+  public Configuration getConf() {
+    return baseConfig.getConf();
+  }
+
+  /**
+   * Get the manifest Success data; only valid after a job.
+   * @return the job _SUCCESS data, or null.
+   */
+  public ManifestSuccessData getSuccessReport() {
+    return successReport;
+  }
+
+  private void setSuccessReport(ManifestSuccessData successReport) {
+    this.successReport = successReport;
+  }
+
+  /**
+   * Get the manifest of the last committed task.
+   * @return a task manifest or null.
+   */
+  @VisibleForTesting
+  TaskManifest getTaskAttemptCommittedManifest() {
+    return taskAttemptCommittedManifest;
+  }
+
+  /**
+   * Compute the path where the output of a task attempt is stored until
+   * that task is committed.
+   * @param context the context of the task attempt.
+   * @return the path where a task attempt should be stored.
+   */
+  @VisibleForTesting
+  public Path getTaskAttemptPath(TaskAttemptContext context) {
+    return enterCommitter(false, context).getTaskAttemptDir();
+  }
+
+  /**
+   * The path to where the manifest file of a task attempt will be
+   * saved when the task is committed.
+   * This path will be the same for all attempts of the same task.
+   * @param context the context of the task attempt.
+   * @return the path where a task attempt should be stored.
+   */
+  @VisibleForTesting
+  public Path getTaskManifestPath(TaskAttemptContext context) {
+    final Path dir = enterCommitter(false, context).getTaskManifestDir();
+
+    return manifestPathForTask(dir,
+        context.getTaskAttemptID().getTaskID().toString());
+  }
+
+  /**
+   * Compute the path where the data of a job attempt is stored until
+   * the job is committed.
+   * @param context the context of the job.
+   * @return the path of the job attempt directory.
+   */
+  @VisibleForTesting
+  public Path getJobAttemptPath(JobContext context) {
+
+    return enterCommitter(false, context).getJobAttemptDir();
+  }
+
+  /**
+   * Get the final output path, including resolving any relative path.
+   * @param outputPath output path
+   * @param conf configuration to create any FS with
+   * @return a resolved path.
+   * @throws IOException failure.
+   */
+  private Path resolveDestinationDirectory(Path outputPath,
+      Configuration conf) throws IOException {
+    return FileSystem.get(outputPath.toUri(), conf).makeQualified(outputPath);
+  }
+
+  /**
+   * Create manifest store operations for the destination store.
+   * This MUST NOT be used for the success report operations, as
+   * they may be to a different filesystem.
+   * This is a point which can be overridden during testing.
+   * @return a new store operations instance bonded to the destination fs.
+   * @throws IOException failure to instantiate.
+   */
+  protected ManifestStoreOperations createManifestStoreOperations() throws IOException {
+    return ManifestCommitterSupport.createManifestStoreOperations(
+        baseConfig.getConf(),
+        baseConfig.getDestinationFileSystem(),
+        baseConfig.getDestinationDir());
+  }
+
+  /**
+   * Log IO Statistics at debug.
+   */
+  private void logCommitterStatisticsAtDebug() {
+    logIOStatisticsAtDebug(LOG, "Committer Statistics", this);
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder(
+        "ManifestCommitter{");
+    sb.append(baseConfig);
+    sb.append(", iostatistics=").append(ioStatisticsToPrettyString(iostatistics));
+    sb.append('}');
+    return sb.toString();
+  }
+
+  /**
+   * Save a summary to the report dir if the config option
+   * is set.
+   * The IOStatistics of the summary will be updated to the latest
+   * snapshot of the committer's statistics, so the report is up
+   * to date.
+   * The report will be updated with the current active stage,
+   * and if {@code thrown} is non-null, it will be added to the
+   * diagnostics (and the job tagged as a failure).
+   * Static for testability.
+   * @param activeStage active stage
+   * @param config configuration to use.
+   * @param report summary file.
+   * @param thrown any exception indicating failure.
+   * @param quiet should exceptions be swallowed.
+   * @param overwrite should the existing file be overwritten
+   * @return the path of a file, if successfully saved
+   * @throws IOException if a failure occurred and quiet==false
+   */
+  private static Path maybeSaveSummary(
+      String activeStage,
+      ManifestCommitterConfig config,
+      ManifestSuccessData report,
+      Throwable thrown,
+      boolean quiet,
+      boolean overwrite) throws IOException {
+    Configuration conf = config.getConf();
+    String reportDir = conf.getTrimmed(OPT_SUMMARY_REPORT_DIR, "");
+    if (reportDir.isEmpty()) {
+      LOG.debug("No summary directory set in " + OPT_SUMMARY_REPORT_DIR);
+      return null;
+    }
+    LOG.debug("Summary directory set in to {}" + OPT_SUMMARY_REPORT_DIR,
+        reportDir);
+
+    // update to the latest statistics
+    report.snapshotIOStatistics(config.getIOStatistics());
+
+    Path reportDirPath = new Path(reportDir);
+    Path path = new Path(reportDirPath,
+        createJobSummaryFilename(config.getJobUniqueId()));
+
+    if (thrown != null) {
+      report.recordJobFailure(thrown);
+    }
+    report.putDiagnostic(STAGE, activeStage);
+    // the store operations here is explicitly created for the FS where
+    // the reports go, which may not be the target FS of the job.
+
+    final FileSystem fs = path.getFileSystem(conf);
+    try (ManifestStoreOperations operations = new ManifestStoreOperationsThroughFileSystem(fs)) {
+      if (!overwrite) {
+        // check for file existence so there is no need to worry about
+        // precisely what exception is raised when overwrite=false and dest file
+        // exists
+        try {
+          FileStatus st = operations.getFileStatus(path);
+          // get here and the file exists
+          LOG.debug("Report already exists: {}", st);
+          return null;
+        } catch (FileNotFoundException ignored) {
+        }
+      }
+      operations.save(report, path, overwrite);
+      LOG.info("Job summary saved to {}", path);
+      return path;
+    } catch (IOException e) {
+      LOG.debug("Failed to save summary to {}", path, e);
+      if (quiet) {
+        return null;
+      } else {
+        throw e;
+      }
+    }
+  }
+
+  @Override
+  public IOStatisticsStore getIOStatistics() {
+    return iostatistics;
+  }
+}
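
Before the next file, a hedged, test-style sketch of the lifecycle this class implements (setupJob, setupTask, commitTask, commitJob). The TaskAttemptContextImpl construction, the attempt ID string and the destination path are illustrative assumptions; a real MR or Spark job supplies these contexts itself:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.TaskAttemptID;
    import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitter;

    public class ManifestCommitterLifecycleSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumption: a local path just for demonstration; abfs/gcs are the real targets.
        Path dest = new Path("file:///tmp/manifest-committer-demo");

        // Hypothetical task attempt; any parseable attempt ID string works here.
        TaskAttemptID attempt =
            TaskAttemptID.forName("attempt_200707121733_0001_m_000000_0");
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(conf, attempt);

        ManifestCommitter committer = new ManifestCommitter(dest, taskContext);
        committer.setupJob(taskContext);      // create job attempt and manifest dirs
        committer.setupTask(taskContext);     // create the task attempt dir
        // ... record writers would create files under committer.getWorkPath() ...
        if (committer.needsTaskCommit(taskContext)) {
          committer.commitTask(taskContext);  // scan the attempt dir, save a manifest
        }
        committer.commitJob(taskContext);     // load manifests, rename files, write _SUCCESS
      }
    }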

+ 375 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConfig.java

@@ -0,0 +1,375 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+import java.util.Objects;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageEventCallbacks;
+import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;
+import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.concurrent.HadoopExecutors;
+import org.apache.hadoop.util.functional.CloseableTaskPoolSubmitter;
+
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.SUCCESSFUL_JOB_OUTPUT_DIR_MARKER;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.*;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.buildJobUUID;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.getAppAttemptId;
+
+/**
+ * The configuration for the committer as built up from the job configuration
+ * and data passed down from the committer factory.
+ * Isolated for ease of development and testing.
+ */
+public final class ManifestCommitterConfig implements IOStatisticsSource {
+
+  /**
+   * Final destination of work.
+   * This is <i>unqualified</i>.
+   */
+  private final Path destinationDir;
+
+  /**
+   * Role: used in log/text messages.
+   */
+  private final String role;
+
+  /**
+   * This is the directory for all intermediate work: where the output
+   * format will write data.
+   * Will be null if built from a job context.
+   */
+  private final Path taskAttemptDir;
+
+  /** Configuration of the job. */
+  private final Configuration conf;
+
+  /** The job context. For a task, this can be cast to a TaskContext. */
+  private final JobContext jobContext;
+
+  /** Should a job marker be created? */
+  private final boolean createJobMarker;
+
+  /**
+   * Job ID or UUID, without any attempt suffix.
+   * This is expected/required to be unique, though
+   * Spark has had "issues" there until recently
+   * with lack of uniqueness of generated MR Job IDs.
+   */
+  private final String jobUniqueId;
+
+  /**
+   * Where did the job Unique ID come from?
+   */
+  private final String jobUniqueIdSource;
+
+  /**
+   * Number of this attempt; starts at zero.
+   */
+  private final int jobAttemptNumber;
+
+  /**
+   * Job ID + AttemptID.
+   */
+  private final String jobAttemptId;
+
+  /**
+   * Task ID: used as the filename of the manifest.
+   * Will be "" if built from a job context.
+   */
+  private final String taskId;
+
+  /**
+   * Task attempt ID. Determines the working
+   * directory for task attempts to write data into,
+   * and for the task committer to scan.
+   * Will be "" if built from a job context.
+   */
+  private final String taskAttemptId;
+
+  /** Any progressable for progress callbacks. */
+  private final Progressable progressable;
+
+  /**
+   * IOStatistics to update.
+   */
+  private final IOStatisticsStore iostatistics;
+
+
+  /** Should the output be validated after the commit? */
+  private final boolean validateOutput;
+
+  /**
+   * Attempt directory management.
+   */
+  private final ManifestCommitterSupport.AttemptDirectories dirs;
+
+  /**
+   * Callback when a stage is entered.
+   */
+  private final StageEventCallbacks stageEventCallbacks;
+
+  /**
+   * Name for logging.
+   */
+  private final String name;
+
+  /**
+   * Delete target paths on commit? Stricter, but
+   * higher IO cost.
+   */
+  private final boolean deleteTargetPaths;
+
+  /**
+   * Constructor.
+   * @param outputPath destination path of the job.
+   * @param role role for log messages.
+   * @param context job/task context
+   * @param iostatistics IO Statistics
+   * @param stageEventCallbacks stage event callbacks.
+   */
+
+  ManifestCommitterConfig(
+      final Path outputPath,
+      final String role,
+      final JobContext context,
+      final IOStatisticsStore iostatistics,
+      final StageEventCallbacks stageEventCallbacks) {
+    this.role = role;
+    this.jobContext = context;
+    this.conf = context.getConfiguration();
+    this.destinationDir = outputPath;
+    this.iostatistics = iostatistics;
+    this.stageEventCallbacks = stageEventCallbacks;
+
+    Pair<String, String> pair = buildJobUUID(conf, context.getJobID());
+    this.jobUniqueId = pair.getLeft();
+    this.jobUniqueIdSource = pair.getRight();
+    this.jobAttemptNumber = getAppAttemptId(context);
+    this.jobAttemptId = this.jobUniqueId + "_" + jobAttemptNumber;
+
+    // build directories
+    this.dirs = new ManifestCommitterSupport.AttemptDirectories(outputPath,
+        this.jobUniqueId, jobAttemptNumber);
+
+    // read in configuration options
+    this.createJobMarker = conf.getBoolean(
+        SUCCESSFUL_JOB_OUTPUT_DIR_MARKER,
+        DEFAULT_CREATE_SUCCESSFUL_JOB_DIR_MARKER);
+    this.validateOutput = conf.getBoolean(
+        OPT_VALIDATE_OUTPUT,
+        OPT_VALIDATE_OUTPUT_DEFAULT);
+    this.deleteTargetPaths = conf.getBoolean(
+        OPT_DELETE_TARGET_FILES,
+        OPT_DELETE_TARGET_FILES_DEFAULT);
+
+    // if constructed with a task attempt, build the task ID and path.
+    if (context instanceof TaskAttemptContext) {
+      // it's a task
+      final TaskAttemptContext tac = (TaskAttemptContext) context;
+      TaskAttemptID taskAttempt = Objects.requireNonNull(
+          tac.getTaskAttemptID());
+      taskAttemptId = taskAttempt.toString();
+      taskId = taskAttempt.getTaskID().toString();
+      // Task attempt dir; must be different across instances
+      taskAttemptDir = dirs.getTaskAttemptPath(taskAttemptId);
+      // the context is also the progress callback.
+      progressable = tac;
+      name = String.format(InternalConstants.NAME_FORMAT_TASK_ATTEMPT, taskAttemptId);
+
+    } else {
+      // it's a job
+      taskId = "";
+      taskAttemptId = "";
+      taskAttemptDir = null;
+      progressable = null;
+      name = String.format(InternalConstants.NAME_FORMAT_JOB_ATTEMPT, jobAttemptId);
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "ManifestCommitterConfig{" +
+        "name=" + name +
+        ", destinationDir=" + destinationDir +
+        ", role='" + role + '\'' +
+        ", taskAttemptDir=" + taskAttemptDir +
+        ", createJobMarker=" + createJobMarker +
+        ", jobUniqueId='" + jobUniqueId + '\'' +
+        ", jobUniqueIdSource='" + jobUniqueIdSource + '\'' +
+        ", jobAttemptNumber=" + jobAttemptNumber +
+        ", jobAttemptId='" + jobAttemptId + '\'' +
+        ", taskId='" + taskId + '\'' +
+        ", taskAttemptId='" + taskAttemptId + '\'' +
+        '}';
+  }
+
+  /**
+   * Get the destination filesystem.
+   * @return destination FS.
+   * @throws IOException Problems binding to the destination FS.
+   */
+  FileSystem getDestinationFileSystem() throws IOException {
+    return FileSystem.get(destinationDir.toUri(), conf);
+  }
+
+  /**
+   * Create the stage config from the committer
+   * configuration.
+   * This does not bind the store operations
+   * or processors.
+   * @return a stage config with configuration options passed in.
+   */
+  StageConfig createStageConfig() {
+    StageConfig stageConfig = new StageConfig();
+    stageConfig
+        .withIOStatistics(iostatistics)
+        .withJobAttemptNumber(jobAttemptNumber)
+        .withJobDirectories(dirs)
+        .withJobId(jobUniqueId)
+        .withJobIdSource(jobUniqueIdSource)
+        .withName(name)
+        .withProgressable(progressable)
+        .withStageEventCallbacks(stageEventCallbacks)
+        .withTaskAttemptDir(taskAttemptDir)
+        .withTaskAttemptId(taskAttemptId)
+        .withTaskId(taskId)
+        .withDeleteTargetPaths(deleteTargetPaths);
+
+    return stageConfig;
+  }
+
+  public Path getDestinationDir() {
+    return destinationDir;
+  }
+
+  public String getRole() {
+    return role;
+  }
+
+  public Path getTaskAttemptDir() {
+    return taskAttemptDir;
+  }
+
+  public Path getJobAttemptDir() {
+    return dirs.getJobAttemptDir();
+  }
+
+  public Path getTaskManifestDir() {
+    return dirs.getTaskManifestDir();
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public JobContext getJobContext() {
+    return jobContext;
+  }
+
+  public boolean getCreateJobMarker() {
+    return createJobMarker;
+  }
+
+  public String getJobAttemptId() {
+    return jobAttemptId;
+  }
+
+  public String getTaskAttemptId() {
+    return taskAttemptId;
+  }
+
+  public String getTaskId() {
+    return taskId;
+  }
+
+  public String getJobUniqueId() {
+    return jobUniqueId;
+  }
+
+  public boolean getValidateOutput() {
+    return validateOutput;
+  }
+
+  public String getName() {
+    return name;
+  }
+
+  @Override
+  public IOStatisticsStore getIOStatistics() {
+    return iostatistics;
+  }
+
+  /**
+   * Create a new submitter task pool from the
+   * {@link ManifestCommitterConstants#OPT_IO_PROCESSORS}
+   * settings.
+   * @return a new thread pool.
+   */
+  public CloseableTaskPoolSubmitter createSubmitter() {
+    return createSubmitter(
+        OPT_IO_PROCESSORS, OPT_IO_PROCESSORS_DEFAULT);
+  }
+
+  /**
+   * Create a new submitter task pool.
+   * @param key config key with pool size.
+   * @param defVal default value.
+   * @return a new task pool.
+   */
+  public CloseableTaskPoolSubmitter createSubmitter(String key, int defVal) {
+    int numThreads = conf.getInt(key, defVal);
+    if (numThreads <= 0) {
+      // fall back to the default if the setting is invalid.
+      numThreads = defVal;
+    }
+    return createCloseableTaskSubmitter(numThreads, getJobAttemptId());
+  }
+
+  /**
+   * Create a new submitter task pool.
+   *
+   * @param numThreads thread count.
+   * @param jobAttemptId job ID
+   * @return a new task pool.
+   */
+  public static CloseableTaskPoolSubmitter createCloseableTaskSubmitter(
+      final int numThreads,
+      final String jobAttemptId) {
+    return new CloseableTaskPoolSubmitter(
+        HadoopExecutors.newFixedThreadPool(numThreads,
+            new ThreadFactoryBuilder()
+                .setDaemon(true)
+                .setNameFormat("manifest-committer-" + jobAttemptId + "-%d")
+                .build()));
+  }
+
+}
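
A minimal sketch of how a caller might size and build the submitter pool by hand, following the same logic as createSubmitter(); "conf" and "jobAttemptId" are assumed variables, and the helpers are those defined above:

    // Resolve the pool size from the io.threads option, falling back to the default.
    int threads = conf.getInt(
        ManifestCommitterConstants.OPT_IO_PROCESSORS,
        ManifestCommitterConstants.OPT_IO_PROCESSORS_DEFAULT);
    if (threads <= 0) {
      threads = ManifestCommitterConstants.OPT_IO_PROCESSORS_DEFAULT;
    }
    // Daemon threads named after the job attempt, wrapped in a closeable submitter.
    CloseableTaskPoolSubmitter pool =
        ManifestCommitterConfig.createCloseableTaskSubmitter(threads, jobAttemptId);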

+ 240 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConstants.java

@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperationsThroughFileSystem;
+
+/**
+ * Public constants for the manifest committer.
+ * This includes all configuration options and their default values.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+public final class ManifestCommitterConstants {
+
+  /**
+   * Suffix to use in manifest files in the manifest subdir.
+   * Value: {@value}.
+   */
+  public static final String MANIFEST_SUFFIX = "-manifest.json";
+
+  /**
+   * Prefix for summary files in the report dir. Value: {@value}.
+   */
+  public static final String SUMMARY_FILENAME_PREFIX = "summary-";
+
+  /**
+   * Format string used to build a summary file from a Job ID.
+   */
+  public static final String SUMMARY_FILENAME_FORMAT =
+      SUMMARY_FILENAME_PREFIX + "%s.json";
+
+  /**
+   * Suffix to use for temp files before renaming them.
+   * Value: {@value}.
+   */
+  public static final String TMP_SUFFIX = ".tmp";
+
+  /**
+   * Initial number of all app attempts.
+   * This is fixed in YARN; for Spark jobs the
+   * same number "0" is used.
+   */
+  public static final int INITIAL_APP_ATTEMPT_ID = 0;
+
+  /**
+   * Format string for building a job dir.
+   * Value: {@value}.
+   */
+  public static final String JOB_DIR_FORMAT_STR = "%s";
+
+  /**
+   * Format string for building a job attempt dir.
+   * This uses the job attempt number so previous versions
+   * can be found trivially.
+   * Value: {@value}.
+   */
+  public static final String JOB_ATTEMPT_DIR_FORMAT_STR = "%02d";
+
+  /**
+   * Name of directory under job attempt dir for manifests.
+   */
+  public static final String JOB_TASK_MANIFEST_SUBDIR = "manifests";
+
+  /**
+   * Name of directory under job attempt dir for task attempts.
+   */
+  public static final String JOB_TASK_ATTEMPT_SUBDIR = "tasks";
+
+
+  /**
+   * Committer classname as recorded in the committer _SUCCESS file.
+   */
+  public static final String MANIFEST_COMMITTER_CLASSNAME =
+      ManifestCommitter.class.getName();
+
+  /**
+   * Marker file to create on success: {@value}.
+   */
+  public static final String SUCCESS_MARKER = "_SUCCESS";
+
+  /** Default job marker option: {@value}. */
+  public static final boolean DEFAULT_CREATE_SUCCESSFUL_JOB_DIR_MARKER = true;
+
+  /**
+   * The limit to the number of committed objects tracked during
+   * job commits and saved to the _SUCCESS file.
+   * Value: {@value}.
+   */
+  public static final int SUCCESS_MARKER_FILE_LIMIT = 100;
+
+  /**
+   * The UUID for jobs: {@value}.
+   * This was historically created in Spark 1.x's SQL queries,
+   * but "went away".
+   * It has been restored in recent Spark releases.
+   * If found: it is used instead of the MR job attempt ID.
+   */
+  public static final String SPARK_WRITE_UUID = "spark.sql.sources.writeJobUUID";
+
+  /**
+   * String to use as source of the job ID.
+   * This SHOULD be kept in sync with that of
+   * {@code AbstractS3ACommitter.JobUUIDSource}.
+   * Value: {@value}.
+   */
+  public static final String JOB_ID_SOURCE_MAPREDUCE = "JobID";
+
+  /**
+   * Prefix to use for config options: {@value}.
+   */
+  public static final String OPT_PREFIX = "mapreduce.manifest.committer.";
+
+  /**
+   * Should dir cleanup delete task attempt dirs in parallel
+   * before trying to delete the top-level dirs?
+   * For GCS this may deliver speedup, while on ABFS it may avoid
+   * timeouts in certain deployments.
+   * Value: {@value}.
+   */
+  public static final String OPT_CLEANUP_PARALLEL_DELETE =
+      OPT_PREFIX + "cleanup.parallel.delete";
+
+  /**
+   * Default value:  {@value}.
+   */
+  public static final boolean OPT_CLEANUP_PARALLEL_DELETE_DIRS_DEFAULT = true;
+
+  /**
+   * Threads to use for IO.
+   */
+  public static final String OPT_IO_PROCESSORS = OPT_PREFIX + "io.threads";
+
+  /**
+   * Default value:  {@value}.
+   */
+  public static final int OPT_IO_PROCESSORS_DEFAULT = 64;
+
+  /**
+   * Directory for saving job summary reports.
+   * These are the _SUCCESS files, but are saved even on
+   * job failures.
+   * Value: {@value}.
+   */
+  public static final String OPT_SUMMARY_REPORT_DIR =
+      OPT_PREFIX + "summary.report.directory";
+
+  /**
+   * Directory under which to move manifests for diagnostics.
+   * Value: {@value}.
+   */
+  public static final String OPT_DIAGNOSTICS_MANIFEST_DIR =
+      OPT_PREFIX + "diagnostics.manifest.directory";
+
+  /**
+   * Should the output be validated?
+   * This will check expected vs actual file lengths, and,
+   * if etags can be obtained, etags.
+   * Value: {@value}.
+   */
+  public static final String OPT_VALIDATE_OUTPUT = OPT_PREFIX + "validate.output";
+
+  /**
+   * Default value: {@value}.
+   */
+  public static final boolean OPT_VALIDATE_OUTPUT_DEFAULT = false;
+
+  /**
+   * Should job commit probe for files/directories at the targets
+   * of renames and, if found, delete them?
+   *
+   * This is part of the effective behavior of the FileOutputCommitter,
+   * however it adds an extra delete call per file being committed.
+   *
+   * If a job is writing to a directory which has only just been created,
+   * or where unique filenames are being used, there is no need to perform
+   * this preparation.
+   * The recognition of newly created dirs is automatic.
+   *
+   * Value: {@value}.
+   */
+  public static final String OPT_DELETE_TARGET_FILES =
+      OPT_PREFIX + "delete.target.files";
+
+  /**
+   * Default value: {@value}.
+   */
+  public static final boolean OPT_DELETE_TARGET_FILES_DEFAULT = false;
+
+  /**
+   * Name of the factory: {@value}.
+   */
+  public static final String MANIFEST_COMMITTER_FACTORY =
+      ManifestCommitterFactory.class.getName();
+
+  /**
+   * Classname of the store operations; filesystems and tests
+   * may override.
+   * Value: {@value}.
+   */
+  public static final String OPT_STORE_OPERATIONS_CLASS = OPT_PREFIX + "store.operations.classname";
+
+  /**
+   * Default classname of the store operations.
+   * Value: {@value}.
+   */
+  public static final String STORE_OPERATIONS_CLASS_DEFAULT =
+      ManifestStoreOperationsThroughFileSystem.class.getName();
+
+  /**
+   * Stage attribute in audit context: {@value}.
+   */
+  public static final String CONTEXT_ATTR_STAGE = "st";
+
+  /**
+   * Task ID attribute in audit context: {@value}.
+   */
+  public static final String CONTEXT_ATTR_TASK_ATTEMPT_ID = "ta";
+
+  private ManifestCommitterConstants() {
+  }
+
+}
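
The options above are ordinary Configuration keys under the mapreduce.manifest.committer. prefix. A minimal sketch of tuning them from job setup code; the report directory path is only illustrative:

    Configuration conf = new Configuration();
    // check expected vs. actual lengths (and etags where available) during job commit
    conf.setBoolean("mapreduce.manifest.committer.validate.output", true);
    // threads to use for IO in the committer
    conf.setInt("mapreduce.manifest.committer.io.threads", 32);
    // keep a summary report even for failed jobs (path is illustrative)
    conf.set("mapreduce.manifest.committer.summary.report.directory",
        "hdfs://namenode:8020/tmp/committer-reports");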

+ 57 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterFactory.java

@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.UNSUPPORTED_FS_SCHEMAS;
+
+/**
+ * This is the committer factory to register as the source of committers
+ * for the job/filesystem schema.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public class ManifestCommitterFactory extends PathOutputCommitterFactory {
+
+  /**
+   * Name of this factory.
+   */
+  public static final String NAME = ManifestCommitterFactory.class.getName();
+
+  @Override
+  public ManifestCommitter createOutputCommitter(final Path outputPath,
+      final TaskAttemptContext context) throws IOException {
+    // safety check. S3A does not support this, so fail fast.
+    final String scheme = outputPath.toUri().getScheme();
+    if (UNSUPPORTED_FS_SCHEMAS.contains(scheme)) {
+      throw new PathIOException(outputPath.toString(),
+          "This committer does not work with the filesystem of type " + scheme);
+    }
+    return new ManifestCommitter(outputPath, context);
+  }
+
+}
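
For this factory to be selected for a given destination store, it can be bound to the filesystem scheme through the PathOutputCommitterFactory per-scheme key; a sketch for an abfs destination, assuming that key convention:

    // Bind the manifest committer factory to the abfs:// scheme.
    Configuration conf = new Configuration();
    conf.set("mapreduce.outputcommitter.factory.scheme.abfs",
        ManifestCommitterFactory.NAME);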

+ 264 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterStatisticNames.java

@@ -0,0 +1,264 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.statistics.StoreStatisticNames;
+
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_RENAME;
+
+/**
+ * Statistic names for committers.
+ * Please keep in sync with org.apache.hadoop.fs.s3a.Statistic
+ * so that S3A and manifest committers are in sync.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+public final class ManifestCommitterStatisticNames {
+
+
+  /** Amount of data committed: {@value}. */
+  public static final String COMMITTER_BYTES_COMMITTED_COUNT =
+      "committer_bytes_committed";
+
+  /** Duration tracking of the time to commit an entire job: {@value}. */
+  public static final String COMMITTER_COMMIT_JOB =
+      "committer_commit_job";
+
+  /** Number of files committed: {@value}. */
+  public static final String COMMITTER_FILES_COMMITTED_COUNT =
+      "committer_files_committed";
+
+  /** "Count of successful tasks:: {@value}. */
+  public static final String COMMITTER_TASKS_COMPLETED_COUNT =
+      "committer_tasks_completed";
+
+  /** Count of failed tasks: {@value}. */
+  public static final String COMMITTER_TASKS_FAILED_COUNT =
+      "committer_tasks_failed";
+
+  /** Count of commits aborted: {@value}. */
+  public static final String COMMITTER_COMMITS_ABORTED_COUNT =
+      "committer_commits_aborted";
+
+  /** Count of commits reverted: {@value}. */
+  public static final String COMMITTER_COMMITS_REVERTED_COUNT =
+      "committer_commits_reverted";
+
+  /** Count of commits failed: {@value}. */
+  public static final String COMMITTER_COMMITS_FAILED =
+      "committer_commits" + StoreStatisticNames.SUFFIX_FAILURES;
+
+  /**
+   * The number of files in a task. This will be a MeanStatistic.
+   */
+  public static final String COMMITTER_FILE_COUNT_MEAN =
+      "committer_task_file_count";
+
+  /**
+   * File Size.
+   */
+  public static final String COMMITTER_FILE_SIZE_MEAN =
+      "committer_task_file_size";
+
+  /**
+   * The number of directories in a task attempt. This will be a MeanStatistic.
+   */
+  public static final String COMMITTER_TASK_DIRECTORY_COUNT_MEAN =
+      "committer_task_directory_count";
+
+  /**
+   * The depth of a task attempt's directory tree. This will be a MeanStatistic.
+   */
+  public static final String COMMITTER_TASK_DIRECTORY_DEPTH_MEAN =
+      "committer_task_directory_depth";
+
+  /**
+   * The number of files in a task. This will be a MeanStatistic.
+   */
+  public static final String COMMITTER_TASK_FILE_COUNT_MEAN =
+      "committer_task_file_count";
+
+  /**
+   * The number of files in a task. This will be a MeanStatistic.
+   */
+  public static final String COMMITTER_TASK_FILE_SIZE_MEAN =
+      "committer_task_file_size";
+
+  /**
+   * The size of manifest files. This will be a MeanStatistic.
+   * Useful to highlight whether this size is too large and whether
+   * effort to optimise for file IO and memory consumption
+   * would be justified.
+   */
+  public static final String COMMITTER_TASK_MANIFEST_FILE_SIZE =
+      "committer_task_manifest_file_size";
+
+  /**
+   * A file renamed during the commit operation {@value}.
+   */
+  public static final String OP_COMMIT_FILE_RENAME =
+      "commit_file_rename";
+
+  /**
+   * A file rename during the commit operation was recovered from a failure {@value}.
+   */
+  public static final String OP_COMMIT_FILE_RENAME_RECOVERED =
+      "commit_file_rename_recovered";
+
+  /** Directory creation {@value}. */
+  public static final String OP_CREATE_DIRECTORIES = "op_create_directories";
+
+  /** Creating a single directory {@value}. */
+  public static final String OP_CREATE_ONE_DIRECTORY =
+      "op_create_one_directory";
+
+  /**
+   * A file in the destination directory tree has been deleted.
+   *  {@value}.
+   */
+  public static final String OP_DELETE_FILE_UNDER_DESTINATION =
+      "op_delete_file_under_destination";
+
+  /** Directory scan {@value}. */
+  public static final String OP_DIRECTORY_SCAN = "op_directory_scan";
+
+  /**
+   * Overall job commit {@value}.
+   */
+  public static final String OP_STAGE_JOB_COMMIT = COMMITTER_COMMIT_JOB;
+
+  /** {@value}. */
+  public static final String OP_LOAD_ALL_MANIFESTS = "op_load_all_manifests";
+
+  /**
+   * Load a task manifest: {@value}.
+   */
+  public static final String OP_LOAD_MANIFEST = "op_load_manifest";
+
+  /**
+   * mkdir failure statistic: {@value}.
+   * This is incremented whenever mkdir() returns false, e.g. because
+   * there was a file at the end of the path.
+   */
+  public static final String OP_MKDIRS_RETURNED_FALSE = "op_mkdir_returned_false";
+
+  /**
+   * msync statistic: {@value}.
+   * This should be the same as StoreStatisticNames.OP_MSYNC; it's
+   * duplicated just to isolate this committer into a single JAR
+   * for ease of testing.
+   */
+  public static final String OP_MSYNC = "op_msync";
+
+  /**
+   * Preparing ancestor dirs: {@value}.
+   * Probing for paths being files and, if so, deleting them.
+   */
+  public static final String OP_PREPARE_DIR_ANCESTORS = "op_prepare_dir_ancestors";
+
+  /** Rename a dir: {@value}. */
+  public static final String OP_RENAME_DIR = OP_RENAME;
+
+
+  /** Rename a file: {@value}. */
+  public static final String OP_RENAME_FILE = OP_RENAME;
+
+  /**
+   * Save a task manifest: {@value}.
+   */
+  public static final String OP_SAVE_TASK_MANIFEST =
+      "task_stage_save_task_manifest";
+
+  /**
+   * Task abort: {@value}.
+   */
+  public static final String OP_STAGE_TASK_ABORT_TASK
+      = "task_stage_abort_task";
+
+  /**
+   * Job abort: {@value}.
+   */
+  public static final String OP_STAGE_JOB_ABORT = "job_stage_abort";
+
+  /**
+   * Job cleanup: {@value}.
+   */
+  public static final String OP_STAGE_JOB_CLEANUP = "job_stage_cleanup";
+
+  /**
+   * Prepare Directories Stage: {@value}.
+   */
+  public static final String OP_STAGE_JOB_CREATE_TARGET_DIRS =
+      "job_stage_create_target_dirs";
+
+  /**
+   * Load Manifest Stage: {@value}.
+   */
+  public static final String OP_STAGE_JOB_LOAD_MANIFESTS =
+      "job_stage_load_manifests";
+
+  /**
+   * Rename files stage duration: {@value}.
+   */
+  public static final String OP_STAGE_JOB_RENAME_FILES =
+      "job_stage_rename_files";
+
+
+  /**
+   * Job Setup Stage: {@value}.
+   */
+  public static final String OP_STAGE_JOB_SETUP = "job_stage_setup";
+
+  /**
+   * Job saving _SUCCESS marker Stage: {@value}.
+   */
+  public static final String OP_STAGE_JOB_SAVE_SUCCESS =
+      "job_stage_save_success_marker";
+
+  /**
+   * Output Validation (within job commit) Stage: {@value}.
+   */
+  public static final String OP_STAGE_JOB_VALIDATE_OUTPUT =
+      "job_stage_optional_validate_output";
+
+  /**
+   * Task saving manifest file Stage: {@value}.
+   */
+  public static final String OP_STAGE_TASK_SAVE_MANIFEST =
+      "task_stage_save_manifest";
+
+  /**
+   * Task Setup Stage: {@value}.
+   */
+  public static final String OP_STAGE_TASK_SETUP = "task_stage_setup";
+
+  /**
+   * Task Commit Stage: {@value}.
+   */
+  public static final String OP_STAGE_TASK_COMMIT = "task_stage_commit";
+
+  /** Task Scan directory Stage: {@value}. */
+  public static final String OP_STAGE_TASK_SCAN_DIRECTORY
+      = "task_stage_scan_directory";
+
+  private ManifestCommitterStatisticNames() {
+  }
+}
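
These names key the counters and durations the committer publishes through IOStatistics and the _SUCCESS file. A small sketch of reading one back, assuming "success" is a loaded ManifestSuccessData instance:

    // Count of file renames during job commit; null if the statistic was not recorded.
    Long renames = success.getIOStatistics()
        .counters()
        .get(ManifestCommitterStatisticNames.OP_RENAME_FILE);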

+ 137 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/AbstractManifestData.java

@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import javax.annotation.Nullable;
+import java.io.IOException;
+import java.io.Serializable;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.util.JsonSerialization;
+
+import static java.util.Objects.requireNonNull;
+
+/**
+ * Class for single/multiple commit data structures.
+ */
+@SuppressWarnings("serial")
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public abstract class AbstractManifestData<T extends AbstractManifestData>
+    implements Serializable, IOStatisticsSource {
+
+
+  /**
+   * Convert a path to a string which can be included in the JSON.
+   * @param path path
+   * @return a string value, or, if path==null, null.
+   */
+  public static String marshallPath(@Nullable Path path) {
+    return path != null
+        ? path.toUri().toString()
+        : null;
+  }
+
+  /**
+   * Convert a string path to Path type, by way of a URI.
+   * @param path path as a string
+   * @return path value
+   * @throws RuntimeException marshalling failure.
+   */
+  public static Path unmarshallPath(String path) {
+    try {
+      return new Path(new URI(requireNonNull(path, "No path")));
+    } catch (URISyntaxException e) {
+      throw new RuntimeException(
+          "Failed to parse \"" + path + "\" : " + e,
+          e);
+
+    }
+  }
+
+  /**
+   * Validate the data: those fields which must be non empty, must be set.
+   * @return the validated instance.
+   * @throws IOException if the data is invalid
+   */
+  public abstract T validate() throws IOException;
+
+  /**
+   * Serialize to JSON and then to a byte array, after performing a
+   * preflight validation of the data to be saved.
+   * @return the data in a persistable form.
+   * @throws IOException serialization problem or validation failure.
+   */
+  public abstract byte[] toBytes() throws IOException;
+
+  /**
+   * Save to a hadoop filesystem.
+   * @param fs filesystem
+   * @param path path
+   * @param overwrite should any existing file be overwritten
+   * @throws IOException IO exception
+   */
+  public abstract void save(FileSystem fs, Path path, boolean overwrite)
+      throws IOException;
+
+  /**
+   * Get a (usually shared) JSON serializer.
+   * @return a serializer.
+   */
+  public abstract JsonSerialization<T> createSerializer();
+
+  /**
+   * Verify that all instances in a collection are of the given class.
+   * @param it iterator
+   * @param classname classname to require
+   * @throws IOException on a failure
+   */
+  void validateCollectionClass(Iterable it, Class classname)
+      throws IOException {
+    for (Object o : it) {
+      verify(o.getClass().equals(classname),
+          "Collection element is not a %s: %s", classname, o.getClass());
+    }
+  }
+
+  /**
+   * Verify that a condition holds.
+   * @param expression expression which must be true
+   * @param message message to raise on a failure
+   * @param args arguments for the message formatting
+   * @throws IOException on a failure
+   */
+
+  static void verify(boolean expression,
+      String message,
+      Object... args) throws IOException {
+    if (!expression) {
+      throw new IOException(String.format(message, args));
+    }
+  }
+}
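
Paths are persisted in the JSON as URI strings through marshallPath/unmarshallPath; a round-trip sketch with an illustrative path:

    Path dest = new Path("abfs://container@account.dfs.core.windows.net/out/part-00000");
    String wire = AbstractManifestData.marshallPath(dest);      // URI string stored in JSON
    Path restored = AbstractManifestData.unmarshallPath(wire);  // back to a Path
    // restored is expected to equal dest for a well-formed URI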

+ 44 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/DiagnosticKeys.java

@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+/**
+ * Diagnostic keys in the manifests.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+public final class DiagnosticKeys {
+  /**
+   * Attribute added to diagnostics in _SUCCESS file.
+   */
+  public static final String PRINCIPAL = "principal";
+  public static final String STAGE = "stage";
+  public static final String EXCEPTION = "exception";
+  public static final String STACKTRACE = "stacktrace";
+
+
+  /** Directory where manifests were renamed: {@value}. */
+  public static final String MANIFESTS = "manifests";
+
+  private DiagnosticKeys() {
+  }
+}

+ 202 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/DirEntry.java

@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Objects;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.Path;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.marshallPath;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.unmarshallPath;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.verify;
+
+/**
+ * A directory entry in the task manifest.
+ * Uses shorter field names for smaller files.
+ * Hash and equals are on dir name only; there's no real expectation
+ * that those operations are needed.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+public final class DirEntry implements Serializable {
+
+  private static final long serialVersionUID = 5658520530209859765L;
+
+  /**
+   * Destination directory.
+   */
+  @JsonProperty("d")
+  private String dir;
+
+  /**
+   * Type of dest entry as found when probed for in task commit.
+   */
+  @JsonProperty("t")
+  private int type;
+
+  /**
+   * Level in the treewalk.
+   */
+  @JsonProperty("l")
+  private int level;
+
+  /**
+   * Constructor only for use by jackson.
+   * Do Not Delete.
+   */
+  private DirEntry() {
+  }
+
+  /**
+   * Construct an entry.
+   *
+   * @param dir destination path.
+   * @param type type of dest entry
+   * @param level Level in the treewalk.
+   *
+   */
+  public DirEntry(
+      final String dir,
+      final int type,
+      final int level) {
+    this.dir = requireNonNull(dir);
+    this.type = type;
+    this.level = level;
+  }
+
+  /**
+   * Construct an entry.
+   *
+   * @param dir destination path.
+   * @param type type of dest entry
+   * @param level Level in the treewalk.
+   *
+   */
+  public DirEntry(
+      final Path dir,
+      final int type,
+      final int level) {
+    this(marshallPath(dir), type, level);
+  }
+
+  public void setDir(final String dir) {
+    this.dir = dir;
+  }
+
+  public String getDir() {
+    return dir;
+  }
+
+  @JsonIgnore
+  public Path getDestPath() {
+    return unmarshallPath(dir);
+  }
+
+  public int getType() {
+    return type;
+  }
+
+  public void setType(final int type) {
+    this.type = type;
+  }
+
+  public void setLevel(final int level) {
+    this.level = level;
+  }
+
+  public int getLevel() {
+    return level;
+  }
+
+  @JsonIgnore
+  public EntryStatus getStatus() {
+    return EntryStatus.toEntryStatus(type);
+  }
+
+  @JsonIgnore
+  public void setStatus(EntryStatus status) {
+    setType(status.ordinal());
+  }
+  public void validate() throws IOException {
+    final String s = toString();
+    verify(dir != null && dir.length() > 0,
+        "destination path is missing from " + s);
+    verify(type >= 0,
+        "Invalid type in " + s);
+    verify(level >= 0,
+        "Invalid level in " + s);
+  }
+
+  @Override
+  public String toString() {
+    return "DirEntry{" +
+        "dir='" + dir + '\'' +
+        ", type=" + type +
+        ", level=" + level +
+        '}';
+  }
+
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    DirEntry dirEntry = (DirEntry) o;
+    return dir.equals(dirEntry.dir);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(dir);
+  }
+
+  /**
+   * A directory entry.
+   * @param dest destination path.
+   * @param type type
+   * @param level Level in the treewalk.
+   * @return an entry
+   */
+  public static DirEntry dirEntry(Path dest, int type, int level) {
+    return new DirEntry(dest, type, level);
+  }
+
+  /**
+   * A directory entry.
+   * @param dest destination path.
+   * @param type type
+   * @param level Level in the treewalk.
+   * @return an entry
+   */
+  public static DirEntry dirEntry(Path dest, EntryStatus type, int level) {
+    return dirEntry(dest, type.ordinal(), level);
+  }
+
+}
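
A sketch of recording a destination directory probed during task commit; the path, status and level are illustrative:

    DirEntry entry = DirEntry.dirEntry(
        new Path("abfs://container@account.dfs.core.windows.net/out/year=2021"),
        EntryStatus.not_found,   // nothing found at the destination when probed
        1);                      // level in the treewalk
    entry.validate();            // throws IOException if a field is invalid
    EntryStatus status = entry.getStatus();   // type int mapped back to the enum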

+ 87 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/EntryStatus.java

@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import javax.annotation.Nullable;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileStatus;
+
+/**
+ * Status of a file or dir entry, designed to be marshalled as
+ * an integer: the ordinal value of the enum is the
+ * wire value.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public enum EntryStatus {
+
+  unknown,
+  not_found,
+  file,
+  dir,
+  created_dir;
+
+  /**
+   * Go from a marshalled type to a status value.
+   * Any out of range value is converted to unknown.
+   * @param type type
+   * @return the status value.
+   */
+  public static EntryStatus toEntryStatus(int type) {
+    switch (type) {
+    case 1:
+      return not_found;
+    case 2:
+      return file;
+    case 3:
+      return dir;
+    case 4:
+      return created_dir;
+    case 0:
+    default:
+      return unknown;
+    }
+  }
+
+
+  /**
+   * Go from the result of a getFileStatus call or
+   * listing entry to a status.
+   * A null argument is mapped to {@link #not_found}.
+   * @param st file status
+   * @return the status enum.
+   */
+  public static EntryStatus toEntryStatus(@Nullable FileStatus st) {
+
+    if (st == null) {
+      return not_found;
+    }
+    if (st.isDirectory()) {
+      return dir;
+    }
+    if (st.isFile()) {
+      return file;
+    }
+    return unknown;
+  }
+
+
+}
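
The enum travels as its ordinal, so the wire value and the probe result map as in this sketch (FileStatus import assumed):

    int wire = EntryStatus.file.ordinal();                   // 2 on the wire
    EntryStatus decoded = EntryStatus.toEntryStatus(wire);   // back to EntryStatus.file
    EntryStatus absent = EntryStatus.toEntryStatus((FileStatus) null);  // not_found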

+ 189 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/FileEntry.java

@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Objects;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.Path;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.marshallPath;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.unmarshallPath;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.verify;
+
+/**
+ * A File entry in the task manifest.
+ * Uses shorter field names for smaller files.
+ */
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public final class FileEntry implements Serializable {
+
+  private static final long serialVersionUID = -550288489009777867L;
+
+  @JsonProperty("s")
+  private String source;
+
+  @JsonProperty("d")
+  private String dest;
+
+  @JsonProperty("z")
+  private long size;
+
+  /**
+   * Etag value if we can extract this.
+   */
+  @JsonProperty("e")
+  private String etag;
+
+  /**
+   * Constructor only for use by jackson.
+   * Do Not Delete.
+   */
+  private FileEntry() {
+  }
+
+  /**
+   * Construct an entry.
+   * @param source source path.
+   * @param dest destination path.
+   * @param size file size.
+   * @param etag optional etag
+   */
+  public FileEntry(
+      final String source,
+      final String dest,
+      final long size,
+      final String etag) {
+    this.source = source;
+    this.dest = dest;
+    this.size = size;
+    this.etag = etag;
+  }
+
+
+  /**
+   * Construct an entry.
+   * @param source source path.
+   * @param dest destination path.
+   * @param size file size.
+   * @param etag optional etag
+   */
+  public FileEntry(
+      final Path source,
+      final Path dest,
+      final long size,
+      final String etag) {
+    this(marshallPath(source), marshallPath(dest), size, etag);
+  }
+
+
+  public void setSource(final String source) {
+    this.source = source;
+  }
+
+  public String getSource() {
+    return source;
+  }
+
+  @JsonIgnore
+  public Path getSourcePath() {
+    return unmarshallPath(source);
+  }
+
+  public void setDest(final String dest) {
+    this.dest = dest;
+  }
+
+  public String getDest() {
+    return dest;
+  }
+
+  @JsonIgnore
+  public Path getDestPath() {
+    return unmarshallPath(dest);
+  }
+
+  public long getSize() {
+    return size;
+  }
+
+  public void setSize(final long size) {
+    this.size = size;
+  }
+
+  public String getEtag() {
+    return etag;
+  }
+
+  public void setEtag(final String etag) {
+    this.etag = etag;
+  }
+
+  public void validate() throws IOException {
+    final String s = toString();
+    verify(source != null && source.length() > 0,
+        "Source is missing from " + s);
+    verify(dest != null && dest.length() > 0,
+        "Source is missing from " + s);
+    verify(size >= 0,
+        "Invalid size in " + s);
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder(
+        "FileOrDirEntry{");
+    sb.append("source='").append(source).append('\'');
+    sb.append(", dest='").append(dest).append('\'');
+    sb.append(", size=").append(size);
+    sb.append(", etag='").append(etag).append('\'');
+    sb.append('}');
+    return sb.toString();
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    FileEntry that = (FileEntry) o;
+    return size == that.size && source.equals(that.source) && dest.equals(
+        that.dest) &&
+        Objects.equals(etag, that.etag);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(source, dest);
+  }
+
+}
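
A sketch of the entry a task attempt would add for one file to be renamed at job commit; the paths, size and null etag are illustrative:

    FileEntry file = new FileEntry(
        new Path("abfs://container@account.dfs.core.windows.net/tmp/attempt_00/part-00000"),
        new Path("abfs://container@account.dfs.core.windows.net/out/part-00000"),
        4096L,    // expected length, checked if output validation is enabled
        null);    // etag, when the store exposes one
    file.validate();   // verifies source, dest and size are set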

+ 144 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestPrinter.java

@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ExitUtil;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+
+/**
+ * Tool to print a manifest.
+ */
+public class ManifestPrinter extends Configured implements Tool {
+
+  private static final String USAGE = "ManifestPrinter <success-file>";
+
+  /**
+   * Output for printing.
+   */
+  private final PrintStream out;
+
+  /**
+   * Print to System.out.
+   */
+  public ManifestPrinter() {
+    this(null, System.out);
+  }
+
+  /**
+   * Print to the supplied stream.
+   * @param conf configuration
+   * @param out output
+   */
+  public ManifestPrinter(Configuration conf, PrintStream out) {
+    super(conf);
+    this.out = out;
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args.length != 1) {
+      printUsage();
+      return -1;
+    }
+    Path path = new Path(args[0]);
+    loadAndPrintManifest(path.getFileSystem(getConf()), path);
+    return 0;
+  }
+
+  /**
+   * Load and print a manifest.
+   * @param fs filesystem.
+   * @param path path
+   * @throws IOException failure to load
+   * @return the manifest
+   */
+  public ManifestSuccessData loadAndPrintManifest(FileSystem fs, Path path)
+      throws IOException {
+    // load the manifest
+    println("Manifest file: %s", path);
+    final ManifestSuccessData success = ManifestSuccessData.load(fs, path);
+
+    printManifest(success);
+    return success;
+  }
+
+  private void printManifest(ManifestSuccessData success) {
+    field("succeeded", success.getSuccess());
+    field("created", success.getDate());
+    field("committer", success.getCommitter());
+    field("hostname", success.getHostname());
+    field("description", success.getDescription());
+    field("jobId", success.getJobId());
+    field("jobIdSource", success.getJobIdSource());
+    field("stage", success.getStage());
+    println("Diagnostics\n%s",
+        success.dumpDiagnostics("  ", " = ", "\n"));
+    println("Statistics:\n%s",
+        ioStatisticsToPrettyString(success.getIOStatistics()));
+    out.flush();
+  }
+
+  private void printUsage() {
+    println(USAGE);
+  }
+
+  /**
+   * Print a line to the output stream.
+   * @param format format string
+   * @param args arguments.
+   */
+  private void println(String format, Object... args) {
+    out.format(format, args);
+    out.println();
+  }
+
+  /**
+   * Print a field, if non-null.
+   * @param name field name.
+   * @param value value.
+   */
+  private void field(String name, Object value) {
+    if (value != null) {
+      println("%s: %s", name, value);
+    }
+  }
+
+  /**
+   * Entry point.
+   */
+  public static void main(String[] argv) throws Exception {
+
+    try {
+      int res = ToolRunner.run(new ManifestPrinter(), argv);
+      System.exit(res);
+    } catch (ExitUtil.ExitException e) {
+      ExitUtil.terminate(e);
+    }
+  }
+}
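
Besides the ToolRunner entry point, the printer can be driven programmatically; a sketch with an illustrative _SUCCESS path:

    Configuration conf = new Configuration();
    ManifestPrinter printer = new ManifestPrinter(conf, System.out);
    Path successFile = new Path("abfs://container@account.dfs.core.windows.net/out/_SUCCESS");
    ManifestSuccessData summary = printer.loadAndPrintManifest(
        successFile.getFileSystem(conf), successFile);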

+ 493 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestSuccessData.java

@@ -0,0 +1,493 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.exception.ExceptionUtils;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatistics;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.fs.statistics.IOStatisticsSupport;
+import org.apache.hadoop.util.JsonSerialization;
+
+/**
+ * Summary data saved into a {@code _SUCCESS} marker file.
+ *
+ * This is a copy of the S3A committer success data format, with
+ * a goal of being/remaining compatible.
+ * This makes it easier for tests in downstream modules to
+ * be able to parse the success files from any of the committers.
+ *
+ * This should be considered public; it is based on the S3A
+ * format, which has proven stable over time.
+ *
+ * The JSON format SHOULD be considered public and evolving
+ * with compatibility across versions.
+ *
+ * All the Java serialization data is different and may change
+ * across versions with no stability guarantees other than
+ * "manifest summaries MAY be serialized between processes with
+ * the exact same version of this binary on their classpaths."
+ * That is sufficient for testing in Spark.
+ *
+ * To aid with Java serialization, the maps and lists are
+ * exclusively those which serialize well.
+   * IOStatisticsSnapshot has a lot of complexity in its marshalling;
+   * this class doesn't worry about concurrent access,
+   * so is simpler.
+ *
+ */
+@SuppressWarnings({"unused", "CollectionDeclaredAsConcreteClass"})
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class ManifestSuccessData
+    extends AbstractManifestData<ManifestSuccessData> {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ManifestSuccessData.class);
+
+  /**
+   * Supported version value: {@value}.
+   * If this is changed the value of {@link #serialVersionUID} will change,
+   * to avoid deserialization problems.
+   */
+  public static final int VERSION = 1;
+
+  /**
+   * Serialization ID: {@value}.
+   */
+  private static final long serialVersionUID = 4755993198698104084L + VERSION;
+
+  /**
+   * Name to include in persisted data, so as to differentiate from
+   * any other manifests: {@value}.
+   */
+  public static final String NAME
+      = "org.apache.hadoop.fs.s3a.commit.files.SuccessData/" + VERSION;
+
+  /**
+   * Name of file; includes version marker.
+   */
+  private String name;
+
+  /** Timestamp of creation. */
+  private long timestamp;
+
+  /**
+   * Did this succeed?
+   * It is implicitly true in a _SUCCESS file, but if the file
+   * is also saved to a log dir, then it depends on the outcome of the job.
+   */
+  private boolean success = true;
+
+  /** Timestamp as date string; no expectation of parseability. */
+  private String date;
+
+  /**
+   * Host which created the file (implicitly: committed the work).
+   */
+  private String hostname;
+
+  /**
+   * Committer name.
+   */
+  private String committer;
+
+  /**
+   * Description text.
+   */
+  private String description;
+
+  /** Job ID, if known. */
+  private String jobId = "";
+
+  /**
+   * Source of the job ID.
+   */
+  private String jobIdSource = "";
+
+  /**
+   * Metrics.
+   * Uses a treemap for serialization.
+   */
+  private TreeMap<String, Long> metrics = new TreeMap<>();
+
+  /**
+   * Diagnostics information.
+   * Uses a treemap for serialization.
+   */
+  private TreeMap<String, String> diagnostics = new TreeMap<>();
+
+  /**
+   * Filenames in the commit.
+   */
+  private ArrayList<String> filenames = new ArrayList<>(0);
+
+  /**
+   * IOStatistics.
+   */
+  @JsonProperty("iostatistics")
+  private IOStatisticsSnapshot iostatistics = new IOStatisticsSnapshot();
+
+  /**
+   * State (committed, aborted).
+   */
+  private String state;
+
+  /**
+   * Stage: last stage executed.
+   */
+  private String stage;
+
+  @Override
+  public ManifestSuccessData validate() throws IOException {
+    verify(name != null,
+        "Incompatible file format: no 'name' field");
+    verify(NAME.equals(name),
+        "Incompatible file format: " + name);
+    return this;
+  }
+
+  @Override
+  public JsonSerialization<ManifestSuccessData> createSerializer() {
+    return serializer();
+  }
+
+  @Override
+  public byte[] toBytes() throws IOException {
+    return serializer().toBytes(this);
+  }
+
+  /**
+   * To JSON.
+   * @return json string value.
+   * @throws IOException failure
+   */
+  public String toJson() throws IOException {
+    return serializer().toJson(this);
+  }
+
+  @Override
+  public void save(FileSystem fs, Path path, boolean overwrite)
+      throws IOException {
+    // always set the name field before being saved.
+    name = NAME;
+    serializer().save(fs, path, this, overwrite);
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder(
+        "ManifestSuccessData{");
+    sb.append("committer='").append(committer).append('\'');
+    sb.append(", hostname='").append(hostname).append('\'');
+    sb.append(", description='").append(description).append('\'');
+    sb.append(", date='").append(date).append('\'');
+    sb.append(", filenames=[").append(
+        StringUtils.join(filenames, ", "))
+        .append("]");
+    sb.append('}');
+    return sb.toString();
+  }
+
+  /**
+   * Dump the metrics (if any) to a string.
+   * The metrics are sorted for ease of viewing.
+   * @param prefix prefix before every entry
+   * @param middle string between key and value
+   * @param suffix suffix to each entry
+   * @return the dumped string
+   */
+  public String dumpMetrics(String prefix, String middle, String suffix) {
+    return joinMap(metrics, prefix, middle, suffix);
+  }
+
+  /**
+   * Dump the diagnostics (if any) to a string.
+   * @param prefix prefix before every entry
+   * @param middle string between key and value
+   * @param suffix suffix to each entry
+   * @return the dumped string
+   */
+  public String dumpDiagnostics(String prefix, String middle, String suffix) {
+    return joinMap(diagnostics, prefix, middle, suffix);
+  }
+
+  /**
+   * Join any map of string to value into a string, sorting the keys first.
+   * @param map map to join
+   * @param prefix prefix before every entry
+   * @param middle string between key and value
+   * @param suffix suffix to each entry
+   * @return a string for reporting.
+   */
+  protected static String joinMap(Map<String, ?> map,
+      String prefix,
+      String middle, String suffix) {
+    if (map == null) {
+      return "";
+    }
+    List<String> list = new ArrayList<>(map.keySet());
+    Collections.sort(list);
+    StringBuilder sb = new StringBuilder(list.size() * 32);
+    for (String k : list) {
+      sb.append(prefix)
+          .append(k)
+          .append(middle)
+          .append(map.get(k))
+          .append(suffix);
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Load an instance from a file, then validate it.
+   * @param fs filesystem
+   * @param path path
+   * @return the loaded instance
+   * @throws IOException IO failure
+   */
+  public static ManifestSuccessData load(FileSystem fs, Path path)
+      throws IOException {
+    LOG.debug("Reading success data from {}", path);
+    ManifestSuccessData instance = serializer().load(fs, path);
+    instance.validate();
+    return instance;
+  }
+
+  /**
+   * Get a JSON serializer for this class.
+   * @return a serializer.
+   */
+  public static JsonSerialization<ManifestSuccessData> serializer() {
+    return new JsonSerialization<>(ManifestSuccessData.class, false, true);
+  }
+
+  public String getName() {
+    return name;
+  }
+
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  /** @return timestamp of creation. */
+  public long getTimestamp() {
+    return timestamp;
+  }
+
+  public void setTimestamp(long timestamp) {
+    this.timestamp = timestamp;
+  }
+
+  /** @return timestamp as date; no expectation of parseability. */
+  public String getDate() {
+    return date;
+  }
+
+  public void setDate(String date) {
+    this.date = date;
+  }
+
+  /**
+   * @return host which created the file (implicitly: committed the work).
+   */
+  public String getHostname() {
+    return hostname;
+  }
+
+  public void setHostname(String hostname) {
+    this.hostname = hostname;
+  }
+
+  /**
+   * @return committer name.
+   */
+  public String getCommitter() {
+    return committer;
+  }
+
+  public void setCommitter(String committer) {
+    this.committer = committer;
+  }
+
+  /**
+   * @return any description text.
+   */
+  public String getDescription() {
+    return description;
+  }
+
+  public void setDescription(String description) {
+    this.description = description;
+  }
+
+  /**
+   * @return any metrics.
+   */
+  public Map<String, Long> getMetrics() {
+    return metrics;
+  }
+
+  public void setMetrics(TreeMap<String, Long> metrics) {
+    this.metrics = metrics;
+  }
+
+  /**
+   * @return a list of filenames in the commit.
+   */
+  public List<String> getFilenames() {
+    return filenames;
+  }
+
+  /**
+   * Get the list of filenames as paths.
+   * @return the paths.
+   */
+  @JsonIgnore
+  public List<Path> getFilenamePaths() {
+    return getFilenames().stream()
+        .map(AbstractManifestData::unmarshallPath)
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * Set the list of filename paths.
+   */
+  @JsonIgnore
+  public void setFilenamePaths(List<Path> paths) {
+    setFilenames(new ArrayList<>(
+        paths.stream()
+            .map(AbstractManifestData::marshallPath)
+            .collect(Collectors.toList())));
+  }
+
+  public void setFilenames(ArrayList<String> filenames) {
+    this.filenames = filenames;
+  }
+
+  public Map<String, String> getDiagnostics() {
+    return diagnostics;
+  }
+
+  public void setDiagnostics(TreeMap<String, String> diagnostics) {
+    this.diagnostics = diagnostics;
+  }
+
+  /**
+   * Add a diagnostics entry.
+   * @param key name
+   * @param value value
+   */
+  public void putDiagnostic(String key, String value) {
+    diagnostics.put(key, value);
+  }
+
+  /** @return Job ID, if known. */
+  public String getJobId() {
+    return jobId;
+  }
+
+  public void setJobId(String jobId) {
+    this.jobId = jobId;
+  }
+
+  public String getJobIdSource() {
+    return jobIdSource;
+  }
+
+  public void setJobIdSource(final String jobIdSource) {
+    this.jobIdSource = jobIdSource;
+  }
+
+  @Override
+  public IOStatisticsSnapshot getIOStatistics() {
+    return iostatistics;
+  }
+
+  public void setIOStatistics(final IOStatisticsSnapshot ioStatistics) {
+    this.iostatistics = ioStatistics;
+  }
+
+  /**
+   * Set the IOStatistics to a snapshot of the source.
+   * @param iostats statistics; may be null.
+   */
+  public void snapshotIOStatistics(IOStatistics iostats) {
+    setIOStatistics(IOStatisticsSupport.snapshotIOStatistics(iostats));
+  }
+
+  /**
+   * Set the success flag.
+   * @param success did the job succeed?
+   */
+  public void setSuccess(boolean success) {
+    this.success = success;
+  }
+
+  /**
+   * Get the success flag.
+   * @return did the job succeed?
+   */
+  public boolean getSuccess() {
+    return success;
+  }
+
+  public String getState() {
+    return state;
+  }
+
+  public void setState(String state) {
+    this.state = state;
+  }
+
+  public String getStage() {
+    return stage;
+  }
+
+  /**
+   * Note a failure by setting success flag to false,
+   * then add the exception to the diagnostics.
+   * @param thrown throwable
+   */
+  public void recordJobFailure(Throwable thrown) {
+    setSuccess(false);
+    String stacktrace = ExceptionUtils.getStackTrace(thrown);
+    diagnostics.put(DiagnosticKeys.EXCEPTION, thrown.toString());
+    diagnostics.put(DiagnosticKeys.STACKTRACE, stacktrace);
+  }
+}
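
Downstream tests and tools can load the summary directly and inspect the outcome; a sketch assuming an existing Configuration "conf" and an illustrative marker path:

    Path marker = new Path("/results/_SUCCESS");
    ManifestSuccessData success = ManifestSuccessData.load(
        marker.getFileSystem(conf), marker);
    if (!success.getSuccess()) {
      System.out.println("failed at stage " + success.getStage());
      System.out.println(success.getDiagnostics().get(DiagnosticKeys.EXCEPTION));
    }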

+ 365 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/TaskManifest.java

@@ -0,0 +1,365 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import javax.annotation.Nullable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.util.JsonSerialization;
+
+/**
+ * This is the manifest of files which were created by
+ * this task attempt.
+ *
+ * Versioning:
+ * In a rolling cluster update, MR or Spark workers deployed on a newer
+ * node (and/or with a newer version of artifacts in a cluster-FS hosted
+ * tar.gz file) may be running a later version of this class than the
+ * job committer.
+ * If any changes are made to the manifest which are backwards compatible,
+ * this new manifest can still be loaded from JSON and processed.
+ *
+ * If the manifest is no longer compatible, the job output may
+ * be invalid.
+ *
+ * It is CRITICAL that the {@link #VERSION} constant is updated whenever
+ * such an incompatible change is made.
+ */
+@SuppressWarnings("unused")
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class TaskManifest extends AbstractManifestData<TaskManifest> {
+
+  /**
+   * Supported version value: {@value}.
+   * If this is changed the value of {@code serialVersionUID} will change,
+   * to avoid deserialization problems.
+   */
+  public static final int VERSION = 1;
+
+  /**
+   * Manifest type.
+   */
+  public static final String TYPE =
+      "org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest/"
+      + VERSION;
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(TaskManifest.class);
+
+  /**
+   * Serialization version.
+   */
+  private static final long serialVersionUID = 7090285511966046094L + VERSION;
+
+  /**
+   * Manifest type.
+   */
+  @JsonProperty("type")
+  private String type = TYPE;
+
+  /** Version marker. */
+  @JsonProperty("version")
+  private int version = VERSION;
+
+  /**
+   * Job ID; constant over multiple attempts.
+   */
+  @JsonProperty("jobId")
+  private String jobId;
+
+  /**
+   * Number of the job attempt; starts at zero.
+   */
+  @JsonProperty("jobAttemptNumber")
+  private int jobAttemptNumber;
+
+  /**
+   * Task ID.
+   */
+  @JsonProperty("taskID")
+  private String taskID;
+
+  /**
+   * Task Attempt ID.
+   */
+  @JsonProperty("taskAttemptID")
+  private String taskAttemptID;
+
+  /**
+   * The task attempt directory.
+   */
+  @JsonProperty("taskAttemptDir")
+  private String taskAttemptDir;
+
+  /**
+   * The list of files to commit from this task attempt, including
+   * precalculated destination and size.
+   */
+  @JsonProperty("files")
+  private final List<FileEntry> filesToCommit = new ArrayList<>();
+
+  /**
+   * The list of directories needed by this task attempt, both
+   * source and destination.
+   * All these directories must exist in the destination before any of
+   * the files can be renamed there.
+   */
+  @JsonProperty("directories")
+  private final List<DirEntry> destDirectories = new ArrayList<>();
+
+  /**
+   * Any custom extra data committers may choose to add.
+   */
+  private final Map<String, String> extraData = new HashMap<>(0);
+
+  /**
+   * IOStatistics.
+   */
+  @JsonProperty("iostatistics")
+  private IOStatisticsSnapshot iostatistics = new IOStatisticsSnapshot();
+
+  /**
+   * Empty constructor; will be used by Jackson as well as in application
+   * code.
+   */
+  public TaskManifest() {
+  }
+
+  public String getType() {
+    return type;
+  }
+
+  public void setType(String type) {
+    this.type = type;
+  }
+
+  public int getVersion() {
+    return version;
+  }
+
+  public void setVersion(int version) {
+    this.version = version;
+  }
+
+  @Override
+  public IOStatisticsSnapshot getIOStatistics() {
+    return iostatistics;
+  }
+
+  public void setIOStatistics(
+      @Nullable final IOStatisticsSnapshot ioStatistics) {
+    this.iostatistics = ioStatistics;
+  }
+
+  public String getJobId() {
+    return jobId;
+  }
+
+  public void setJobId(final String jobId) {
+    this.jobId = jobId;
+  }
+
+  public int getJobAttemptNumber() {
+    return jobAttemptNumber;
+  }
+
+  public void setJobAttemptNumber(final int jobAttemptNumber) {
+    this.jobAttemptNumber = jobAttemptNumber;
+  }
+
+  public String getTaskID() {
+    return taskID;
+  }
+
+  public void setTaskID(final String taskID) {
+    this.taskID = taskID;
+  }
+
+  public String getTaskAttemptID() {
+    return taskAttemptID;
+  }
+
+  public void setTaskAttemptID(final String taskAttemptID) {
+    this.taskAttemptID = taskAttemptID;
+  }
+
+  public String getTaskAttemptDir() {
+    return taskAttemptDir;
+  }
+
+  public void setTaskAttemptDir(final String taskAttemptDir) {
+    this.taskAttemptDir = taskAttemptDir;
+  }
+
+  /**
+   * Add a file to the list of files to commit.
+   * @param entry entry to add
+   */
+  public void addFileToCommit(FileEntry entry) {
+    filesToCommit.add(entry);
+  }
+
+  public List<FileEntry> getFilesToCommit() {
+    return filesToCommit;
+  }
+
+  /**
+   * Calculate the total amount of data which will be committed.
+   * @return the sum of sizes of all files to commit.
+   */
+  @JsonIgnore
+  public long getTotalFileSize() {
+    return filesToCommit.stream().mapToLong(FileEntry::getSize).sum();
+  }
+
+  /**
+   * All the destination directories.
+   * @return directory list.
+   */
+  public List<DirEntry> getDestDirectories() {
+    return destDirectories;
+  }
+
+  /**
+   * Add a directory to the list of directories to create.
+   * @param entry entry to add
+   */
+  public void addDirectory(DirEntry entry) {
+    destDirectories.add(entry);
+  }
+
+  public Map<String, String> getExtraData() {
+    return extraData;
+  }
+
+  @Override
+  public byte[] toBytes() throws IOException {
+    return serializer().toBytes(this);
+  }
+
+  /**
+   * To JSON.
+   * @return json string value.
+   * @throws IOException failure
+   */
+  public String toJson() throws IOException {
+    return serializer().toJson(this);
+  }
+
+  @Override
+  public void save(FileSystem fs, Path path, boolean overwrite)
+      throws IOException {
+    serializer().save(fs, path, this, overwrite);
+  }
+
+  /**
+   * Validate the data: fields which must be non-empty must be set.
+   * @throws IOException if the data is invalid
+   * @return this instance, for call chaining.
+   */
+  public TaskManifest validate() throws IOException {
+    verify(TYPE.equals(type), "Wrong type: %s", type);
+    verify(version == VERSION, "Wrong version: %s", version);
+    validateCollectionClass(extraData.keySet(), String.class);
+    validateCollectionClass(extraData.values(), String.class);
+    Set<String> destinations = new HashSet<>(filesToCommit.size());
+    validateCollectionClass(filesToCommit, FileEntry.class);
+    for (FileEntry c : filesToCommit) {
+      c.validate();
+      verify(!destinations.contains(c.getDest()),
+          "Destination %s is written to by more than one pending commit",
+          c.getDest());
+      destinations.add(c.getDest());
+    }
+    return this;
+  }
+
+  /**
+   * Get a JSON serializer for this class.
+   * @return a serializer.
+   */
+  @Override
+  public JsonSerialization<TaskManifest> createSerializer() {
+    return serializer();
+  }
+
+  /**
+   * Create a JSON serializer for this class.
+   * @return a serializer.
+   */
+  public static JsonSerialization<TaskManifest> serializer() {
+    return new JsonSerialization<>(TaskManifest.class, false, true);
+  }
+
+  /**
+   * Load an instance from a file, then validate it.
+   * @param fs filesystem
+   * @param path path
+   * @return the loaded instance
+   * @throws IOException IO failure/the data is invalid
+   */
+  public static TaskManifest load(FileSystem fs, Path path)
+      throws IOException {
+    LOG.debug("Reading Manifest in file {}", path);
+    return serializer().load(fs, path).validate();
+  }
+
+  /**
+   * Load an instance from a file, then validate it.
+   * If loading through a listing, use this API so that FileStatus
+   * hints can be used.
+   * @param serializer serializer.
+   * @param fs filesystem
+   * @param path path to load from
+   * @param status status of file to load
+   * @return the loaded instance
+   * @throws IOException IO failure/the data is invalid
+   */
+  public static TaskManifest load(
+      JsonSerialization<TaskManifest> serializer,
+      FileSystem fs,
+      Path path,
+      FileStatus status)
+      throws IOException {
+    LOG.debug("Reading Manifest in file {}", path);
+    return serializer.load(fs, path, status)
+        .validate();
+  }
+
+}
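
As a quick illustration (not part of the patch), a sketch of the manifest round trip using the public API above; the filesystem, path and IDs are assumptions made only for the example:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;

    /** Illustrative only: save a manifest, then load and validate it. */
    public class TaskManifestRoundTrip {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        Path manifestPath = new Path("/tmp/demo/task_demo_0001.json");  // hypothetical path

        TaskManifest manifest = new TaskManifest();
        manifest.setJobId("job_demo_0001");
        manifest.setTaskAttemptID("attempt_demo_0001_m_000000_0");
        // FileEntry/DirEntry instances produced by the directory scan
        // would be added here via addFileToCommit()/addDirectory().
        manifest.save(fs, manifestPath, true);

        // load() validates the type, version and duplicate destinations.
        TaskManifest loaded = TaskManifest.load(fs, manifestPath);
        System.out.println(loaded.toJson());
      }
    }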

+ 41 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/package-info.java

@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Persistence formats.
+ * These are the persistence formats used for passing data from tasks
+ * to the job committer
+ * {@link org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest},
+ * and for a {@code _SUCCESS} file, which is in
+ * {@link org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData}.
+ * The {@code _SUCCESS} file is a copy of the S3A Committer
+ * {@code org.apache.hadoop.fs.s3a.commit.files.ManifestSuccessData},
+ * the intent being that at the JSON level they are compatible.
+ * This is to aid testing/validation and support calls, with one single
+ * format to load.
+ *
+ * Consult the individual formats for their declarations of access;
+ * the _SUCCESS file is one which tests may use.
+ *
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;

+ 94 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/AuditingIntegration.java

@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.fs.audit.CommonAuditContext;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConfig;
+
+import static org.apache.hadoop.fs.audit.AuditConstants.PARAM_JOB_ID;
+import static org.apache.hadoop.fs.audit.CommonAuditContext.currentAuditContext;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.CONTEXT_ATTR_STAGE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.CONTEXT_ATTR_TASK_ATTEMPT_ID;
+
+/**
+ * Helper class to support integration with Hadoop 3.3.2+ Auditing.
+ * This MUST BE the sole place where fs.audit methods are used, so that it
+ * can be replaced by a stub class on any backport.
+ */
+@InterfaceAudience.Private
+public final class AuditingIntegration {
+  private AuditingIntegration() {
+  }
+
+  /**
+   * Add jobID to current context; also
+   * task attempt ID if set.
+   */
+  public static void updateCommonContextOnCommitterEntry(
+      ManifestCommitterConfig committerConfig) {
+    CommonAuditContext context = currentAuditContext();
+    context.put(PARAM_JOB_ID,
+        committerConfig.getJobUniqueId());
+    // maybe the task attempt ID.
+    if (!committerConfig.getTaskAttemptId().isEmpty()) {
+      context.put(CONTEXT_ATTR_TASK_ATTEMPT_ID,
+          committerConfig.getTaskAttemptId());
+    }
+  }
+
+  /**
+   * Callback on stage entry.
+   * Sets the active stage attribute in the
+   * common audit context.
+   * @param stage new stage
+   */
+  public static void enterStage(String stage) {
+    currentAuditContext().put(CONTEXT_ATTR_STAGE, stage);
+  }
+
+  /**
+   * Remove stage from common audit context.
+   */
+  public static void exitStage() {
+    currentAuditContext().remove(CONTEXT_ATTR_STAGE);
+  }
+
+  /**
+   * Remove commit info at the end of the task or job.
+   */
+  public static void updateCommonContextOnCommitterExit() {
+    currentAuditContext().remove(PARAM_JOB_ID);
+    currentAuditContext().remove(CONTEXT_ATTR_TASK_ATTEMPT_ID);
+  }
+
+  /**
+   * Update the thread context with the stage name and
+   * job ID.
+   * This MUST be invoked at the start of methods invoked in helper threads,
+   * to ensure that they are all annotated with job and stage.
+   * @param jobId job ID.
+   * @param stage stage name.
+   */
+  public static void enterStageWorker(String jobId, String stage) {
+    CommonAuditContext context = currentAuditContext();
+    context.put(PARAM_JOB_ID, jobId);
+    context.put(CONTEXT_ATTR_STAGE, stage);
+  }
+}
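
A short sketch (not from the patch) of the intended calling pattern for helper threads; the stage name below is illustrative:

    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration;

    /** Illustrative only: annotate work done in a pool thread with job ID and stage. */
    public final class StageAuditDemo {
      public static void runInWorker(String jobId, Runnable work) {
        AuditingIntegration.enterStageWorker(jobId, "rename-files");  // hypothetical stage name
        try {
          work.run();
        } finally {
          AuditingIntegration.exitStage();
        }
      }
    }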

+ 130 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/InternalConstants.java

@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import java.util.Set;
+
+import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableSet;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OBJECT_CONTINUE_LIST_REQUEST;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OBJECT_LIST_REQUEST;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_DELETE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_GET_FILE_STATUS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_IS_DIRECTORY;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_IS_FILE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_LIST_STATUS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_MKDIRS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.STORE_IO_RATE_LIMITED;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.*;
+
+/**
+ * Constants internal to the manifest committer.
+ */
+@InterfaceAudience.Private
+public final class InternalConstants {
+  private InternalConstants() {
+  }
+
+  /**
+   * Durations.
+   */
+  public static final String[] DURATION_STATISTICS = {
+
+      /* Job stages. */
+      OP_STAGE_JOB_ABORT,
+      OP_STAGE_JOB_CLEANUP,
+      OP_STAGE_JOB_COMMIT,
+      OP_STAGE_JOB_CREATE_TARGET_DIRS,
+      OP_STAGE_JOB_LOAD_MANIFESTS,
+      OP_STAGE_JOB_RENAME_FILES,
+      OP_STAGE_JOB_SAVE_SUCCESS,
+      OP_STAGE_JOB_SETUP,
+      OP_STAGE_JOB_VALIDATE_OUTPUT,
+
+      /* Task stages. */
+
+      OP_STAGE_TASK_ABORT_TASK,
+      OP_STAGE_TASK_COMMIT,
+      OP_STAGE_TASK_SAVE_MANIFEST,
+      OP_STAGE_TASK_SCAN_DIRECTORY,
+      OP_STAGE_TASK_SETUP,
+
+      /* Lower level store/fs operations. */
+      OP_COMMIT_FILE_RENAME,
+      OP_CREATE_DIRECTORIES,
+      OP_CREATE_ONE_DIRECTORY,
+      OP_DIRECTORY_SCAN,
+      OP_DELETE,
+      OP_DELETE_FILE_UNDER_DESTINATION,
+      OP_GET_FILE_STATUS,
+      OP_IS_DIRECTORY,
+      OP_IS_FILE,
+      OP_LIST_STATUS,
+      OP_LOAD_MANIFEST,
+      OP_LOAD_ALL_MANIFESTS,
+      OP_MKDIRS,
+      OP_MKDIRS_RETURNED_FALSE,
+      OP_MSYNC,
+      OP_PREPARE_DIR_ANCESTORS,
+      OP_RENAME_FILE,
+      OP_SAVE_TASK_MANIFEST,
+
+      OBJECT_LIST_REQUEST,
+      OBJECT_CONTINUE_LIST_REQUEST,
+
+      STORE_IO_RATE_LIMITED
+  };
+
+  /**
+   * Counters.
+   */
+  public static final String[] COUNTER_STATISTICS = {
+      COMMITTER_BYTES_COMMITTED_COUNT,
+      COMMITTER_FILES_COMMITTED_COUNT,
+      COMMITTER_TASKS_COMPLETED_COUNT,
+      COMMITTER_TASKS_FAILED_COUNT,
+      COMMITTER_TASK_DIRECTORY_COUNT_MEAN,
+      COMMITTER_TASK_DIRECTORY_DEPTH_MEAN,
+      COMMITTER_TASK_FILE_COUNT_MEAN,
+      COMMITTER_TASK_FILE_SIZE_MEAN,
+      COMMITTER_TASK_MANIFEST_FILE_SIZE,
+      OP_COMMIT_FILE_RENAME_RECOVERED,
+  };
+
+  /**
+   * Error string from ABFS connector on timeout.
+   */
+  public static final String OPERATION_TIMED_OUT = "OperationTimedOut";
+
+  /**
+   * Format string for task attempt names.
+   */
+  public static final String NAME_FORMAT_TASK_ATTEMPT = "[Task-Attempt %s]";
+
+  /**
+   * Format string for job attempt names.
+   */
+  public static final String NAME_FORMAT_JOB_ATTEMPT = "[Job-Attempt %s]";
+
+  /** Schemes of filesystems known not to work with this committer. */
+  public static final Set<String> UNSUPPORTED_FS_SCHEMAS =
+      ImmutableSet.of("s3a", "wasb");
+}

+ 374 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestCommitterSupport.java

@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import java.io.IOException;
+import java.time.ZonedDateTime;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.EtagSource;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.statistics.IOStatisticsAggregator;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStoreBuilder;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.UserGroupInformation;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.iostatisticsStore;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.PENDING_DIR_NAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.INITIAL_APP_ATTEMPT_ID;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_ATTEMPT_DIR_FORMAT_STR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_DIR_FORMAT_STR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_ID_SOURCE_MAPREDUCE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_TASK_ATTEMPT_SUBDIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_TASK_MANIFEST_SUBDIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_CLASSNAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_SUFFIX;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_STORE_OPERATIONS_CLASS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SPARK_WRITE_UUID;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUMMARY_FILENAME_FORMAT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.TMP_SUFFIX;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.PRINCIPAL;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.STAGE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.COUNTER_STATISTICS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.DURATION_STATISTICS;
+
+/**
+ * Class for manifest committer support util methods.
+ */
+
+@InterfaceAudience.Private
+public final class ManifestCommitterSupport {
+
+  private ManifestCommitterSupport() {
+  }
+
+  /**
+   * Create an IOStatistics Store with the standard statistics
+   * set up.
+   * @return a store builder preconfigured with the standard stats.
+   */
+  public static IOStatisticsStoreBuilder createIOStatisticsStore() {
+
+    final IOStatisticsStoreBuilder store
+        = iostatisticsStore();
+
+    store.withCounters(COUNTER_STATISTICS);
+    store.withMaximums(COUNTER_STATISTICS);
+    store.withMinimums(COUNTER_STATISTICS);
+    store.withMeanStatistics(COUNTER_STATISTICS);
+    store.withDurationTracking(DURATION_STATISTICS);
+    return store;
+  }
+
+  /**
+   * If the object is an IOStatisticsSource, get and add
+   * its IOStatistics.
+   * @param ios aggregator to update.
+   * @param o source object.
+   */
+  public static void maybeAddIOStatistics(IOStatisticsAggregator ios,
+      Object o) {
+    if (o instanceof IOStatisticsSource) {
+      ios.aggregate(((IOStatisticsSource) o).getIOStatistics());
+    }
+  }
+
+  /**
+   * Build a Job UUID from the job conf (the
+   * {@link ManifestCommitterConstants#SPARK_WRITE_UUID}
+   * option, if set) or the MR job ID.
+   * @param conf job/task configuration
+   * @param jobId job ID from YARN or Spark.
+   * @return (a job ID, source)
+   */
+  public static Pair<String, String> buildJobUUID(Configuration conf,
+      JobID jobId) {
+    String jobUUID = conf.getTrimmed(SPARK_WRITE_UUID, "");
+    if (jobUUID.isEmpty()) {
+      jobUUID = jobId.toString();
+      return Pair.of(jobUUID, JOB_ID_SOURCE_MAPREDUCE);
+    } else {
+      return Pair.of(jobUUID, SPARK_WRITE_UUID);
+    }
+  }
+
+  /**
+   * Get the location of pending job attempts.
+   * @param out the base output directory.
+   * @return the location of pending job attempts.
+   */
+  public static Path getPendingJobAttemptsPath(Path out) {
+    return new Path(out, PENDING_DIR_NAME);
+  }
+
+  /**
+   * Get the Application Attempt Id for this job.
+   * @param context the context to look in
+   * @return the Application Attempt Id for a given job.
+   */
+  public static int getAppAttemptId(JobContext context) {
+    return getAppAttemptId(context.getConfiguration());
+  }
+
+  /**
+   * Get the Application Attempt Id for this job
+   * by looking for {@link MRJobConfig#APPLICATION_ATTEMPT_ID}
+   * in the configuration, falling back to 0 if unset.
+   * For Spark it will always be 0; for MR it will be set in the AM
+   * to the {@code ApplicationAttemptId} the AM is launched with.
+   * @param conf job configuration.
+   * @return the Application Attempt Id for the job.
+   */
+  public static int getAppAttemptId(Configuration conf) {
+    return conf.getInt(MRJobConfig.APPLICATION_ATTEMPT_ID,
+        INITIAL_APP_ATTEMPT_ID);
+  }
+
+  /**
+   * Get the path in the job attempt dir for a manifest for a task.
+   * @param manifestDir manifest directory
+   * @param taskId taskID.
+   * @return the final path to rename the manifest file to
+   */
+  public static Path manifestPathForTask(Path manifestDir, String taskId) {
+
+    return new Path(manifestDir, taskId + MANIFEST_SUFFIX);
+  }
+
+  /**
+   * Get the temporary path in the manifest subdir to which a
+   * task attempt's manifest is saved before being renamed to the
+   * path defined by {@link #manifestPathForTask(Path, String)}.
+   * @param manifestDir manifest directory
+   * @param taskAttemptId task attempt ID.
+   * @return the path to save/load the manifest.
+   */
+  public static Path manifestTempPathForTaskAttempt(Path manifestDir,
+      String taskAttemptId) {
+    return new Path(manifestDir,
+        taskAttemptId + MANIFEST_SUFFIX + TMP_SUFFIX);
+  }
+
+  /**
+   * Create a task manifest from the stage config; the config must be
+   * for a task attempt.
+   * @param stageConfig stage config.
+   * @return a manifest with job and task attempt info set up.
+   */
+  public static TaskManifest createTaskManifest(StageConfig stageConfig) {
+    final TaskManifest manifest = new TaskManifest();
+    manifest.setTaskAttemptID(stageConfig.getTaskAttemptId());
+    manifest.setTaskID(stageConfig.getTaskId());
+    manifest.setJobId(stageConfig.getJobId());
+    manifest.setJobAttemptNumber(stageConfig.getJobAttemptNumber());
+    manifest.setTaskAttemptDir(
+        stageConfig.getTaskAttemptDir().toUri().toString());
+    return manifest;
+  }
+
+  /**
+   * Create success/outcome data.
+   * @param stageConfig configuration.
+   * @param stage stage name to record in the diagnostics.
+   * @return a _SUCCESS object with some diagnostics.
+   */
+  public static ManifestSuccessData createManifestOutcome(
+      StageConfig stageConfig, String stage) {
+    final ManifestSuccessData outcome = new ManifestSuccessData();
+    outcome.setJobId(stageConfig.getJobId());
+    outcome.setJobIdSource(stageConfig.getJobIdSource());
+    outcome.setCommitter(MANIFEST_COMMITTER_CLASSNAME);
+    // real timestamp
+    outcome.setTimestamp(System.currentTimeMillis());
+    final ZonedDateTime now = ZonedDateTime.now();
+    outcome.setDate(now.toString());
+    outcome.setHostname(NetUtils.getLocalHostname());
+    // add some extra diagnostics which can still be parsed by older
+    // builds of test applications.
+    // Audit Span information can go in here too, in future.
+    try {
+      outcome.putDiagnostic(PRINCIPAL,
+          UserGroupInformation.getCurrentUser().getShortUserName());
+    } catch (IOException ignored) {
+      // don't know who we are? exclude from the diagnostics.
+    }
+    outcome.putDiagnostic(STAGE, stage);
+    return outcome;
+  }
+
+  /**
+   * Create the filename for a report from the jobID.
+   * @param jobId jobId
+   * @return filename for a report.
+   */
+  public static String createJobSummaryFilename(String jobId) {
+    return String.format(SUMMARY_FILENAME_FORMAT, jobId);
+  }
+
+  /**
+   * Get an etag from a FileStatus, provided it is
+   * an implementation of EtagSource; return null
+   * otherwise.
+   * @param status the status; may be null.
+   * @return the etag, or null if not provided.
+   */
+  public static String getEtag(FileStatus status) {
+    if (status instanceof EtagSource) {
+      return ((EtagSource) status).getEtag();
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Create the manifest store operations for the given FS.
+   * This supports binding to custom filesystem handlers.
+   * @param conf configuration.
+   * @param filesystem fs.
+   * @param path path under FS.
+   * @return a bound store operations instance.
+   * @throws IOException on binding/init problems.
+   */
+  public static ManifestStoreOperations createManifestStoreOperations(
+      final Configuration conf,
+      final FileSystem filesystem,
+      final Path path) throws IOException {
+    try {
+      final Class<? extends ManifestStoreOperations> storeClass = conf.getClass(
+          OPT_STORE_OPERATIONS_CLASS,
+          ManifestStoreOperationsThroughFileSystem.class,
+          ManifestStoreOperations.class);
+      final ManifestStoreOperations operations = storeClass.
+          getDeclaredConstructor().newInstance();
+      operations.bindToFileSystem(filesystem, path);
+      return operations;
+    } catch (Exception e) {
+      throw new PathIOException(path.toString(),
+          "Failed to create Store Operations from configuration option "
+              + OPT_STORE_OPERATIONS_CLASS
+              + ":" + e, e);
+    }
+  }
+
+  /**
+   * Logic to create directory names from job and attempt.
+   * This is self-contained so that it can be used in tests
+   * as well as in the committer.
+   */
+  public static class AttemptDirectories {
+
+    /**
+     * Job output path.
+     */
+    private final Path outputPath;
+
+    /**
+     * Path for the job attempt.
+     */
+    private final Path jobAttemptDir;
+
+    /**
+     * Path for the job.
+     */
+    private final Path jobPath;
+
+    /**
+     * Subdir under the job attempt dir where task
+     * attempts will have subdirectories.
+     */
+    private final Path jobAttemptTaskSubDir;
+
+    /**
+     * temp directory under job dest dir.
+     */
+    private final Path outputTempSubDir;
+
+    /**
+     * Directory to save manifests into.
+     */
+    private final Path taskManifestDir;
+
+    /**
+     * Build the attempt directories.
+     * @param outputPath output path
+     * @param jobUniqueId job ID/UUID
+     * @param jobAttemptNumber job attempt number
+     */
+    public AttemptDirectories(
+        Path outputPath,
+        String jobUniqueId,
+        int jobAttemptNumber) {
+      this.outputPath = requireNonNull(outputPath, "Output path");
+
+      this.outputTempSubDir = new Path(outputPath, PENDING_DIR_NAME);
+      // build the path for the job
+      this.jobPath = new Path(outputTempSubDir,
+          String.format(JOB_DIR_FORMAT_STR, jobUniqueId));
+
+      // then the specific path underneath that for the attempt.
+      this.jobAttemptDir = new Path(jobPath,
+          String.format(JOB_ATTEMPT_DIR_FORMAT_STR, jobAttemptNumber));
+
+      // subdir for task attempts.
+      this.jobAttemptTaskSubDir = new Path(jobAttemptDir, JOB_TASK_ATTEMPT_SUBDIR);
+
+      this.taskManifestDir = new Path(jobAttemptDir, JOB_TASK_MANIFEST_SUBDIR);
+    }
+
+    public Path getOutputPath() {
+      return outputPath;
+    }
+
+    public Path getJobAttemptDir() {
+      return jobAttemptDir;
+    }
+
+    public Path getJobPath() {
+      return jobPath;
+    }
+
+    public Path getJobAttemptTaskSubDir() {
+      return jobAttemptTaskSubDir;
+    }
+
+    public Path getTaskAttemptPath(String taskAttemptId) {
+      return new Path(jobAttemptTaskSubDir, taskAttemptId);
+    }
+
+    public Path getOutputTempSubDir() {
+      return outputTempSubDir;
+    }
+
+    public Path getTaskManifestDir() {
+      return taskManifestDir;
+    }
+  }
+}
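
To make the directory layout concrete, a sketch (not from the patch) which prints the paths computed by AttemptDirectories; the destination path and IDs are made up:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.AttemptDirectories;

    /** Illustrative only: show where a job attempt's intermediate data lives. */
    public class AttemptDirectoriesDemo {
      public static void main(String[] args) {
        AttemptDirectories dirs = new AttemptDirectories(
            new Path("abfs://container@account.dfs.core.windows.net/output"),  // hypothetical destination
            "job-uuid-1234",
            1);
        System.out.println("job attempt dir:  " + dirs.getJobAttemptDir());
        System.out.println("manifest dir:     " + dirs.getTaskManifestDir());
        System.out.println("task attempt dir: " + dirs.getTaskAttemptPath("attempt_demo_m_000000_0"));
      }
    }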

+ 291 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperations.java

@@ -0,0 +1,291 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import javax.annotation.Nullable;
+import java.io.Closeable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.time.Duration;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.JsonSerialization;
+
+/**
+ * FileSystem operations which are needed to generate the task manifest.
+ * The specific choice of which implementation to use is configurable.
+ * Object store implementations MAY subclass if they
+ * need to implement resilient commit operations.
+ * However, the actual API MUST NOT be used outside
+ * the manifest committer and its tests.
+ */
+@InterfaceAudience.LimitedPrivate("mapreduce, object-stores")
+@InterfaceStability.Unstable
+public abstract class ManifestStoreOperations implements Closeable {
+
+  /**
+   * Bind to the filesystem.
+   * This is called by the manifest committer after the operations
+   * have been instantiated.
+   * @param fileSystem target FS
+   * @param path actual path under FS.
+   * @throws IOException if there are binding problems.
+   */
+  public void bindToFileSystem(FileSystem fileSystem, Path path) throws IOException {
+
+  }
+
+  /**
+   * Forward to {@link FileSystem#getFileStatus(Path)}.
+   * @param path path
+   * @return status
+   * @throws IOException failure.
+   */
+  public abstract FileStatus getFileStatus(Path path) throws IOException;
+
+  /**
+   * Is a path a file? Used during directory creation.
+   * This is a copy of FileSystem.isFile();
+   * {@code ManifestStoreOperationsThroughFileSystem} calls into
+   * the FS directly so that stores which optimize their probes
+   * can save on IO.
+   * @param path path to probe
+   * @return true if the path exists and resolves to a file
+   * @throws IOException failure other than FileNotFoundException
+   */
+  public boolean isFile(Path path) throws IOException {
+    try {
+      return getFileStatus(path).isFile();
+    } catch (FileNotFoundException e) {
+      return false;
+    }
+  }
+
+  /**
+   * Forward to {@link FileSystem#delete(Path, boolean)}.
+   * If it returns without an error: there is nothing at
+   * the end of the path.
+   * @param path path
+   * @param recursive recursive delete.
+   * @return true if the path was deleted.
+   * @throws IOException failure.
+   */
+  public abstract boolean delete(Path path, boolean recursive)
+      throws IOException;
+
+  /**
+   * Forward to {@link FileSystem#mkdirs(Path)}.
+   * Usual "what does 'false' mean" ambiguity.
+   * @param path path
+   * @return true if the directory was created.
+   * @throws IOException failure.
+   */
+  public abstract boolean mkdirs(Path path) throws IOException;
+
+  /**
+   * Forward to {@link FileSystem#rename(Path, Path)}.
+   * Usual "what does 'false' mean" ambiguity.
+   * @param source source file
+   * @param dest destination path -which must not exist.
+   * @return the return value of the rename
+   * @throws IOException failure.
+   */
+  public abstract boolean renameFile(Path source, Path dest)
+      throws IOException;
+
+  /**
+   * Rename a dir; the default implementation forwards
+   * to {@link #renameFile(Path, Path)}.
+   * Usual "what does 'false' mean?" ambiguity.
+   * @param source source directory
+   * @param dest destination path -which must not exist.
+   * @return the return value of the rename.
+   * @throws IOException failure.
+   */
+  public boolean renameDir(Path source, Path dest)
+      throws IOException {
+    return renameFile(source, dest);
+  }
+
+  /**
+   * List the directory.
+   * @param path path to list.
+   * @return an iterator over the results.
+   * @throws IOException any immediate failure.
+   */
+  public abstract RemoteIterator<FileStatus> listStatusIterator(Path path)
+      throws IOException;
+
+  /**
+   * Load a task manifest from the store.
+   * With a real FS, this is done with
+   * {@link TaskManifest#load(JsonSerialization, FileSystem, Path, FileStatus)}
+   *
+   * @param serializer serializer.
+   * @param st status with the path and other data.
+   * @return the manifest
+   * @throws IOException failure to load/parse
+   */
+  public abstract TaskManifest loadTaskManifest(
+      JsonSerialization<TaskManifest> serializer,
+      FileStatus st) throws IOException;
+
+  /**
+   * Save a manifest (task manifest or success file) by {@code FileSystem.create(path)}.
+   * There's no attempt at renaming anything here.
+   * @param manifestData the manifest/success file
+   * @param path temp path for the initial save
+   * @param overwrite should create(overwrite=true) be used?
+   * @throws IOException failure to load/parse
+   */
+  public abstract <T extends AbstractManifestData<T>> void save(
+      T manifestData,
+      Path path,
+      boolean overwrite) throws IOException;
+
+  /**
+   * Make an msync() call; swallow when unsupported.
+   * @param path path
+   * @throws IOException IO failure
+   */
+  public void msync(Path path) throws IOException {
+
+  }
+
+
+  /**
+   * Extract an etag from a status if the conditions are met.
+   * If the conditions are not met, return null or ""; they will
+   * both be treated as "no etags available"
+   * <pre>
+   *   1. The status is of a type which the implementation recognizes
+   *   as containing an etag.
+   *   2. After casting the etag field can be retrieved
+   *   3. and that value is non-null/non-empty.
+   * </pre>
+   * @param status status, which may be null or any subclass of FileStatus.
+   * @return either a valid etag, or null or "".
+   */
+  public String getEtag(FileStatus status) {
+    return ManifestCommitterSupport.getEtag(status);
+  }
+
+  /**
+   * Does the store preserve etags through renames.
+   * If true, and if the source listing entry has an etag,
+   * it will be used to attempt to validate a failed rename.
+   * @param path path to probe.
+   * @return true if etag comparison is a valid strategy.
+   */
+  public boolean storePreservesEtagsThroughRenames(Path path) {
+    return false;
+  }
+
+  /**
+   * Does the store provide rename resilience through an
+   * implementation of {@link #commitFile(FileEntry)}?
+   * If true then that method will be invoked to commit work
+   * @return true if resilient commit support is available.
+   */
+  public boolean storeSupportsResilientCommit() {
+    return false;
+  }
+
+  /**
+   * Commit one file through any resilient API.
+   * This operation MUST rename source to destination,
+   * else raise an exception.
+   * The result indicates whether or not some
+   * form of recovery took place.
+   *
+   * If etags were collected during task commit, these will be
+   * in the entries passed in here.
+   *
+   * The base implementation always raises
+   * {@code UnsupportedOperationException}
+   * @param entry entry to commit
+   * @return the result of the commit
+   * @throws IOException failure.
+   * @throws UnsupportedOperationException if not available.
+   *
+   */
+  public CommitFileResult commitFile(FileEntry entry) throws IOException {
+    throw new UnsupportedOperationException("Resilient commit not supported");
+  }
+
+  /**
+   * Outcome from the operation {@link #commitFile(FileEntry)}.
+   * As a rename failure MUST raise an exception, this result
+   * only declares whether or not some form of recovery took place.
+   */
+  public static final class CommitFileResult {
+
+    /** Did recovery take place? */
+    private final boolean recovered;
+
+    /** Time waiting for IO capacity, may be null. */
+    @Nullable
+    private final Duration waitTime;
+
+    /**
+     * Full commit result.
+     * @param recovered Did recovery take place?
+     * @param waitTime any time spent waiting for IO capacity.
+     */
+    public static CommitFileResult fromResilientCommit(
+        final boolean recovered,
+        final Duration waitTime) {
+      return new CommitFileResult(recovered, waitTime);
+    }
+
+    /**
+     * Full commit result.
+     * @param recovered Did recovery take place?
+     * @param waitTime any time spent waiting for IO capacity.
+     */
+    public CommitFileResult(final boolean recovered,
+        @Nullable final Duration waitTime) {
+
+      this.recovered = recovered;
+      this.waitTime = waitTime;
+    }
+
+    /**
+     * Did some form of recovery take place?
+     * @return true if the commit succeeded through some form of (etag-based) recovery
+     */
+    public boolean recovered() {
+      return recovered;
+    }
+
+    @Nullable
+    public Duration getWaitTime() {
+      return waitTime;
+    }
+  }
+
+}
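
A sketch (not from the patch) of how a rename stage could drive this API; the real committer's statistics and retry handling are omitted, and the helper class name is hypothetical:

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;

    /** Illustrative only: choose between the resilient and classic rename paths. */
    public final class CommitOneFile {
      public static boolean commit(ManifestStoreOperations operations,
          FileEntry entry, Path source, Path dest) throws IOException {
        if (operations.storeSupportsResilientCommit()) {
          // resilient path: rename with optional etag-based recovery
          return operations.commitFile(entry).recovered();
        }
        // classic path: plain rename, with the usual "false" ambiguity
        return operations.renameFile(source, dest);
      }
    }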

+ 187 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java

@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.CommonPathCapabilities;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.JsonSerialization;
+
+/**
+ * Implementation of manifest store operations through the filesystem API.
+ * This class is subclassed in the ABFS module, which adds the resilient
+ * commit method.
+ */
+@InterfaceAudience.LimitedPrivate("mapreduce, object-stores")
+@InterfaceStability.Unstable
+public class ManifestStoreOperationsThroughFileSystem extends ManifestStoreOperations {
+
+  /**
+   * Filesystem; set in {@link #bindToFileSystem(FileSystem, Path)}.
+   */
+  private FileSystem fileSystem;
+
+  /**
+   * Has a call to FileSystem.msync() failed as unsupported?
+   * If so, no new attempts will be made when
+   * (@link {@link #msync(Path)} is invoked.
+   */
+  private boolean msyncUnsupported = false;
+
+  /**
+   * Direct Constructor.
+   * @param fileSystem filesystem to write through.
+   */
+  public ManifestStoreOperationsThroughFileSystem(final FileSystem fileSystem) {
+    this.fileSystem = fileSystem;
+  }
+
+  /**
+   * Constructor used for introspection-based binding.
+   */
+  public ManifestStoreOperationsThroughFileSystem() {
+  }
+
+  @Override
+  public void close() throws IOException {
+    /* no-op; FS is assumed to be shared. */
+
+  }
+
+  /**
+   * Get the filesystem.
+   * @return the filesystem; null until bound.
+   */
+  public FileSystem getFileSystem() {
+    return fileSystem;
+  }
+
+  @Override
+  public void bindToFileSystem(FileSystem filesystem, Path path) throws IOException {
+    fileSystem = filesystem;
+  }
+
+  @Override
+  public FileStatus getFileStatus(Path path) throws IOException {
+    return fileSystem.getFileStatus(path);
+  }
+
+  /**
+   * Using FileSystem.isFile to offer stores the option to optimize their probes.
+   * @param path path to probe
+   * @return true if the path resolves to a file.
+   * @throws IOException IO failure.
+   */
+  @SuppressWarnings("deprecation")
+  @Override
+  public boolean isFile(Path path) throws IOException {
+    return fileSystem.isFile(path);
+  }
+
+  @Override
+  public boolean delete(Path path, boolean recursive)
+      throws IOException {
+    return fileSystem.delete(path, recursive);
+  }
+
+  @Override
+  public boolean mkdirs(Path path)
+      throws IOException {
+    return fileSystem.mkdirs(path);
+  }
+
+  @Override
+  public boolean renameFile(Path source, Path dest)
+      throws IOException {
+    return fileSystem.rename(source, dest);
+  }
+
+  @Override
+  public RemoteIterator<FileStatus> listStatusIterator(Path path)
+      throws IOException {
+    return fileSystem.listStatusIterator(path);
+  }
+
+  @Override
+  public TaskManifest loadTaskManifest(
+      JsonSerialization<TaskManifest> serializer,
+      FileStatus st) throws IOException {
+    return TaskManifest.load(serializer, fileSystem, st.getPath(), st);
+  }
+
+  @Override
+  public <T extends AbstractManifestData<T>> void save(
+      final T manifestData,
+      final Path path,
+      final boolean overwrite) throws IOException {
+    manifestData.save(fileSystem, path, overwrite);
+  }
+
+  /**
+   * Probe filesystem capabilities.
+   * @param path path to probe.
+   * @return true if the FS declares that etags are preserved across its renames.
+   */
+  @Override
+  public boolean storePreservesEtagsThroughRenames(Path path) {
+    try {
+      return fileSystem.hasPathCapability(path,
+          CommonPathCapabilities.ETAGS_PRESERVED_IN_RENAME);
+    } catch (IOException ignored) {
+      return false;
+    }
+  }
+
+  /**
+   * Invokes FileSystem msync(); swallows UnsupportedOperationExceptions.
+   * This ensures client metadata caches are in sync in an HDFS-HA deployment.
+   * No other filesystems support this; in the absence of a hasPathCapability()
+   * probe, after the operation is rejected, a flag is set
+   * to stop further attempts from even trying.
+   * @param path path
+   * @throws IOException failure to synchronize.
+   */
+  @Override
+  public void msync(Path path) throws IOException {
+    // there's no need for atomicity here, as the sole cost of
+    // multiple failures is a few extra, rejected msync() calls.
+    if (msyncUnsupported) {
+      return;
+    }
+    // qualify so we can be confident that the FS being synced
+    // is the one we expect.
+    fileSystem.makeQualified(path);
+    try {
+      fileSystem.msync();
+    } catch (UnsupportedOperationException ignored) {
+      // this exception is the default.
+      // set the unsupported flag so no future attempts are made.
+      msyncUnsupported = true;
+    }
+  }
+
+}
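
As an example of this extension point (not part of the patch), a minimal subclass plus the configuration call which would select it through OPT_STORE_OPERATIONS_CLASS; whether a given store really preserves etags across renames is an assumption made here for illustration:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperationsThroughFileSystem;

    import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_STORE_OPERATIONS_CLASS;

    /** Illustrative only: a store which asserts that its renames preserve etags. */
    public class EtagAwareStoreOperations extends ManifestStoreOperationsThroughFileSystem {

      @Override
      public boolean storePreservesEtagsThroughRenames(Path path) {
        return true;  // assumption for the example; a real store should probe/verify this
      }

      /** Bind this class in through the committer's store-operations option. */
      public static void configure(Configuration conf) {
        conf.setClass(OPT_STORE_OPERATIONS_CLASS,
            EtagAwareStoreOperations.class,
            ManifestStoreOperations.class);
      }
    }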

+ 40 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/OutputValidationException.java

@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+
+/**
+ * Exception raised during validation.
+ * This can be treated differently from other outcomes.
+ */
+@InterfaceAudience.Private
+public class OutputValidationException extends PathIOException {
+  public OutputValidationException(Path path, String error) {
+    super(path.toUri().toString(), error);
+  }
+
+  public OutputValidationException(Path path,
+      String error,
+      Throwable cause) {
+    super(path.toUri().toString(), error, cause);
+  }
+}

+ 30 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/package-info.java

@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Implementation classes for the manifest committer.
+ * Nothing outside this module should be using these classes,
+ * except where explicitly stated.
+ */
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;

+ 34 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/package-info.java

@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Intermediate manifest committer.
+ *
+ * Optimized for object stores where listing is slow, directory renames may not
+ * be atomic, and the output is a deep tree of files intermixed with
+ * the output of (many) other task attempts.
+ *
+ * All classes in this module are private/unstable, except where stated.
+ */
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;

+ 63 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbortTaskStage.java

@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_ABORT_TASK;
+
+/**
+ * Abort a task.
+ *
+ * This is done by deleting the task attempt directory.
+ * Exceptions may or may not be suppressed.
+ */
+public class AbortTaskStage extends
+    AbstractJobOrTaskStage<Boolean, Path> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      AbortTaskStage.class);
+
+  public AbortTaskStage(final StageConfig stageConfig) {
+    super(true, stageConfig, OP_STAGE_TASK_ABORT_TASK, false);
+  }
+
+  /**
+   * Delete the task attempt directory.
+   * @param suppressExceptions should exceptions be ignored?
+   * @return the directory
+   * @throws IOException failure when exceptions were not suppressed
+   */
+  @Override
+  protected Path executeStage(final Boolean suppressExceptions)
+      throws IOException {
+    final Path dir = getTaskAttemptDir();
+    if (dir != null) {
+      LOG.info("{}: Deleting task attempt directory {}", getName(), dir);
+      deleteDir(dir, suppressExceptions);
+    }
+    return dir;
+  }
+
+}

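A minimal sketch of invoking this stage directly, assuming a fully populated StageConfig for the task attempt; the Boolean argument to apply() is the suppressExceptions flag and the return value is the deleted directory.

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.AbortTaskStage;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

    final class AbortTaskExample {
      /** Abort a task attempt, logging rather than rethrowing delete failures. */
      static Path abortQuietly(StageConfig stageConfig) throws IOException {
        return new AbortTaskStage(stageConfig).apply(true);
      }
    }
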
+ 942 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbstractJobOrTaskStage.java

@@ -0,0 +1,942 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.FileAlreadyExistsException;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.fs.statistics.DurationTracker;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.util.OperationDuration;
+import org.apache.hadoop.util.Preconditions;
+import org.apache.hadoop.util.functional.CallableRaisingIOE;
+import org.apache.hadoop.util.functional.RemoteIterators;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_DELETE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_GET_FILE_STATUS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_IS_FILE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_LIST_STATUS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_MKDIRS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.STORE_IO_RATE_LIMITED;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.createTracker;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfInvocation;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_SUFFIX;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_COMMIT_FILE_RENAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_COMMIT_FILE_RENAME_RECOVERED;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_LOAD_MANIFEST;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_MSYNC;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_RENAME_FILE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_SAVE_TASK_MANIFEST;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration.enterStageWorker;
+
+/**
+ * A Stage in Task/Job Commit.
+ * A stage can be executed once only, creating the return value of the
+ * {@link #apply(Object)} method, and, potentially, updating the state of the
+ * store via {@link ManifestStoreOperations}.
+ * IOStatistics will also be updated.
+ * Stages are expected to be combined to form the commit protocol.
+ * @param <IN> Type of arguments to the stage.
+ * @param <OUT> Type of result.
+ */
+public abstract class AbstractJobOrTaskStage<IN, OUT>
+    implements JobOrTaskStage<IN, OUT> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      AbstractJobOrTaskStage.class);
+
+  /**
+   * Error text on rename failure: {@value}.
+   */
+  public static final String FAILED_TO_RENAME_PREFIX = "Failed to ";
+
+  /**
+   * Is this a task stage? If so, toString() includes task
+   * info.
+   */
+  private final boolean isTaskStage;
+
+  /**
+   * Configuration of all the stages in the ongoing committer
+   * operation.
+   */
+  private final StageConfig stageConfig;
+
+  /**
+   * Name of the stage for statistics and logging.
+   */
+  private final String stageStatisticName;
+
+  /**
+   * Callbacks to update store.
+   * This is not made visible to the stages; they must
+   * go through the wrapper classes in this class, which
+   * add statistics and logging.
+   */
+  private final ManifestStoreOperations operations;
+
+  /**
+   * Submitter for doing IO against the store.
+   */
+  private final TaskPool.Submitter ioProcessors;
+
+  /**
+   * Used to stop any re-entrancy of the rename.
+   * This is an execute-once operation.
+   */
+  private final AtomicBoolean executed = new AtomicBoolean(false);
+
+  /**
+   * Tracker of the duration of the execution of the stage.
+   * set after {@link #executeStage(Object)} completes.
+   */
+  private DurationTracker stageExecutionTracker;
+
+  /**
+   * Name for logging.
+   */
+  private final String name;
+
+  /**
+   * Constructor.
+   * @param isTaskStage Is this a task stage?
+   * @param stageConfig stage-independent configuration.
+   * @param stageStatisticName name of the stage for statistics/logging
+   * @param requireIOProcessors are the IO processors required?
+   */
+  protected AbstractJobOrTaskStage(
+      final boolean isTaskStage,
+      final StageConfig stageConfig,
+      final String stageStatisticName,
+      final boolean requireIOProcessors) {
+    this.isTaskStage = isTaskStage;
+    this.stageStatisticName = stageStatisticName;
+    this.stageConfig = stageConfig;
+    requireNonNull(stageConfig.getDestinationDir(), "Destination Directory");
+    requireNonNull(stageConfig.getJobId(), "Job ID");
+    requireNonNull(stageConfig.getJobAttemptDir(), "Job attempt directory");
+    this.operations = requireNonNull(stageConfig.getOperations(),
+        "Operations callbacks");
+    // and the processors of work if required.
+    this.ioProcessors = bindProcessor(
+        requireIOProcessors,
+        stageConfig.getIoProcessors());
+    String stageName;
+    if (isTaskStage) {
+      // force fast failure.
+      getRequiredTaskId();
+      getRequiredTaskAttemptId();
+      getRequiredTaskAttemptDir();
+      stageName = String.format("[Task-Attempt %s]", getRequiredTaskAttemptId());
+    } else  {
+      stageName = String.format("[Job-Attempt %s/%02d]",
+          stageConfig.getJobId(),
+          stageConfig.getJobAttemptNumber());
+    }
+    name = stageName;
+  }
+
+  /**
+   * Bind to the processor if it is required.
+   * @param required is the processor required?
+   * @param processor processor
+   * @return the processor binding
+   * @throws NullPointerException if required == true and processor is null.
+   */
+  private TaskPool.Submitter bindProcessor(
+      final boolean required,
+      final TaskPool.Submitter processor) {
+    return required
+        ? requireNonNull(processor, "required IO processor is null")
+        : null;
+  }
+
+  /**
+   * Stage entry point.
+   * Verifies that this is the first and only time the stage is invoked,
+   * then calls {@link #executeStage(Object)} for the subclass
+   * to perform its part of the commit protocol.
+   * The duration of the stage is collected as a statistic, and its
+   * entry/exit logged at INFO.
+   * @param arguments arguments to the function.
+   * @return the result.
+   * @throws IOException failures.
+   */
+  @Override
+  public final OUT apply(final IN arguments) throws IOException {
+    executeOnlyOnce();
+    progress();
+    String stageName = getStageName(arguments);
+    getStageConfig().enterStage(stageName);
+    String statisticName = getStageStatisticName(arguments);
+    // track the duration of the stage execution.
+    LOG.info("{}: Executing Stage {}", getName(), stageName);
+    stageExecutionTracker = createTracker(getIOStatistics(), statisticName);
+    try {
+      // exec the input function and return its value
+      final OUT out = executeStage(arguments);
+      LOG.info("{}: Stage {} completed after {}",
+          getName(),
+          stageName,
+          OperationDuration.humanTime(
+              stageExecutionTracker.asDuration().toMillis()));
+      return out;
+    } catch (IOException | RuntimeException e) {
+      LOG.error("{}: Stage {} failed: after {}: {}",
+          getName(),
+          stageName,
+          OperationDuration.humanTime(
+              stageExecutionTracker.asDuration().toMillis()),
+          e.toString());
+      LOG.debug("{}: Stage failure:", getName(), e);
+      // input function failed: note it
+      stageExecutionTracker.failed();
+      // and rethrow
+      throw e;
+    } finally {
+      // update the tracker.
+      // this is called after the catch() call would have
+      // set the failed flag.
+      stageExecutionTracker.close();
+      progress();
+      getStageConfig().exitStage(stageName);
+    }
+  }
+
+  /**
+   * The work of a stage.
+   * Executed exactly once.
+   * @param arguments arguments to the function.
+   * @return the result.
+   * @throws IOException failures.
+   */
+  protected abstract OUT executeStage(IN arguments) throws IOException;
+
+  /**
+   * Check that the operation has not been invoked twice.
+   * This is an atomic check.
+   * @throws IllegalStateException on a second invocation.
+   */
+  private void executeOnlyOnce() {
+    Preconditions.checkState(
+        !executed.getAndSet(true),
+        "Stage attempted twice");
+  }
+
+  /**
+   * The stage statistic name.
+   * @param arguments args to the invocation.
+   * @return stage name.
+   */
+  protected String getStageStatisticName(IN arguments) {
+    return stageStatisticName;
+  }
+
+  /**
+   * Stage name for reporting; defaults to
+   * the value of {@link #getStageStatisticName(Object)}.
+   * @param arguments args to the invocation.
+   * @return name used in updating reports.
+   */
+  protected String getStageName(IN arguments) {
+    return getStageStatisticName(arguments);
+  }
+
+  /**
+   * Get the execution tracker; non-null
+   * after stage execution.
+   * @return a tracker or null.
+   */
+  public DurationTracker getStageExecutionTracker() {
+    return stageExecutionTracker;
+  }
+
+  /**
+   * Adds the duration of the stage execution to an IOStatistics store
+   * (such as the manifest to be saved).
+   * @param iostats store
+   * @param statistic statistic name.
+   */
+  public void addExecutionDurationToStatistics(IOStatisticsStore iostats,
+      String statistic) {
+    iostats.addTimedOperation(
+        statistic,
+        getStageExecutionTracker().asDuration());
+  }
+
+  /**
+   * Note any rate limiting to the given timing statistic.
+   * If the wait was 0, no statistics are updated.
+   * @param statistic statistic key.
+   * @param wait wait duration.
+   */
+  private void noteAnyRateLimiting(String statistic, Duration wait) {
+    if (!wait.isZero()) {
+      // rate limiting took place
+      getIOStatistics().addTimedOperation(
+          statistic,
+          wait.toMillis());
+    }
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder(
+        "AbstractJobOrTaskStage{");
+    sb.append(isTaskStage ? "Task Stage" : "Job Stage");
+    sb.append(" name='").append(name).append('\'');
+    sb.append(" stage='").append(stageStatisticName).append('\'');
+    sb.append('}');
+    return sb.toString();
+  }
+
+  /**
+   * The stage configuration.
+   * @return the stage configuration used by this stage.
+   */
+  protected StageConfig getStageConfig() {
+    return stageConfig;
+  }
+
+  /**
+   * Update the thread context with the stage name and
+   * job ID.
+   * This MUST be invoked at the start of methods invoked in helper threads,
+   * to ensure that they are all annotated with job and stage.
+   * @param stage stage name.
+   */
+  protected void updateAuditContext(final String stage) {
+    enterStageWorker(stageConfig.getJobId(), stage);
+  }
+
+  /**
+   * The IOStatistics are shared across all uses of the
+   * StageConfig.
+   * @return the (possibly shared) IOStatistics.
+   */
+  @Override
+  public final IOStatisticsStore getIOStatistics() {
+    return stageConfig.getIOStatistics();
+  }
+
+  /**
+   * Call progress() on any Progressable passed in.
+   */
+  protected final void progress() {
+    if (stageConfig.getProgressable() != null) {
+      stageConfig.getProgressable().progress();
+    }
+  }
+
+  /**
+   * Get a file status value or, if the path doesn't exist, return null.
+   * @param path path
+   * @return status or null
+   * @throws IOException IO Failure.
+   */
+  protected final FileStatus getFileStatusOrNull(
+      final Path path)
+      throws IOException {
+    try {
+      return getFileStatus(path);
+    } catch (FileNotFoundException e) {
+      return null;
+    }
+  }
+
+  /**
+   * Get the status of a path.
+   * @param path path
+   * @return the status
+   * @throws FileNotFoundException if the path does not exist.
+   * @throws IOException IO Failure.
+   */
+  protected final FileStatus getFileStatus(
+      final Path path)
+      throws IOException {
+    LOG.trace("{}: getFileStatus('{}')", getName(), path);
+    requireNonNull(path,
+        () -> String.format("%s: Null path for getFileStatus() call", getName()));
+    return trackDuration(getIOStatistics(), OP_GET_FILE_STATUS, () ->
+        operations.getFileStatus(path));
+  }
+
+  /**
+   * Probe whether a path exists and resolves to a file.
+   * @param path path
+   * @return true if the path resolves to a file
+   * @throws IOException IO Failure.
+   */
+  protected final boolean isFile(
+      final Path path)
+      throws IOException {
+    LOG.trace("{}: isFile('{}')", getName(), path);
+    return trackDuration(getIOStatistics(), OP_IS_FILE, () -> {
+      return operations.isFile(path);
+    });
+  }
+
+  /**
+   * Delete a path.
+   * @param path path
+   * @param recursive recursive delete.
+   * @return the outcome of the delete call.
+   * @throws IOException IO Failure.
+   */
+  protected final boolean delete(
+      final Path path,
+      final boolean recursive)
+      throws IOException {
+    LOG.trace("{}: delete('{}, {}')", getName(), path, recursive);
+    return delete(path, recursive, OP_DELETE);
+  }
+
+  /**
+   * Delete a path.
+   * @param path path
+   * @param recursive recursive delete.
+   * @param statistic statistic to update
+   * @return the outcome of the delete call.
+   * @throws IOException IO Failure.
+   */
+  protected Boolean delete(
+      final Path path,
+      final boolean recursive,
+      final String statistic)
+      throws IOException {
+    return trackDuration(getIOStatistics(), statistic, () -> {
+      return operations.delete(path, recursive);
+    });
+  }
+
+  /**
+   * Create a directory.
+   * @param path path
+   * @param escalateFailure escalate "false" to PathIOE
+   * @return true if the directory was created/exists.
+   * @throws IOException IO Failure.
+   */
+  protected final boolean mkdirs(
+      final Path path,
+      final boolean escalateFailure)
+      throws IOException {
+    LOG.trace("{}: mkdirs('{}')", getName(), path);
+    return trackDuration(getIOStatistics(), OP_MKDIRS, () -> {
+      boolean success = operations.mkdirs(path);
+      if (!success && escalateFailure) {
+        throw new PathIOException(path.toUri().toString(),
+            stageStatisticName + ": mkdirs() returned false");
+      }
+      return success;
+    });
+
+  }
+
+  /**
+   * List all files directly under a path.
+   * Async implementations may under-report their durations.
+   * @param path path
+   * @return iterator over the results.
+   * @throws IOException IO Failure.
+   */
+  protected final RemoteIterator<FileStatus> listStatusIterator(
+      final Path path)
+      throws IOException {
+    LOG.trace("{}: listStatusIterator('{}')", getName(), path);
+    return trackDuration(getIOStatistics(), OP_LIST_STATUS, () ->
+        operations.listStatusIterator(path));
+  }
+
+  /**
+   * Load a manifest file.
+   * @param status source.
+   * @return the manifest.
+   * @throws IOException IO Failure.
+   */
+  protected final TaskManifest loadManifest(
+      final FileStatus status)
+      throws IOException {
+    LOG.trace("{}: loadManifest('{}')", getName(), status);
+    return trackDuration(getIOStatistics(), OP_LOAD_MANIFEST, () ->
+        operations.loadTaskManifest(
+            stageConfig.currentManifestSerializer(),
+            status));
+  }
+
+  /**
+   * List all the manifests in the task manifest dir.
+   * @return an iterator of manifests.
+   * @throws IOException IO Failure.
+   */
+  protected final RemoteIterator<FileStatus> listManifests()
+      throws IOException {
+    return RemoteIterators.filteringRemoteIterator(
+        listStatusIterator(getTaskManifestDir()),
+        st -> st.getPath().toUri().toString().endsWith(MANIFEST_SUFFIX));
+  }
+
+  /**
+   * Make an msync() call; swallow when unsupported.
+   * @param path path
+   * @throws IOException IO failure
+   */
+  protected final void msync(Path path) throws IOException {
+    LOG.trace("{}: msync('{}')", getName(), path);
+    trackDurationOfInvocation(getIOStatistics(), OP_MSYNC, () ->
+        operations.msync(path));
+  }
+
+  /**
+   * Create a directory, failing if it already exists or if
+   * mkdirs() returned false.
+   * @param operation operation for error reporting.
+   * @param path path to create.
+   * @return the path.
+   * @throws IOException failure
+   * @throws PathIOException mkdirs failed.
+   * @throws FileAlreadyExistsException destination exists.
+   */
+  protected final Path createNewDirectory(
+      final String operation,
+      final Path path) throws IOException {
+    LOG.trace("{}: {} createNewDirectory('{}')", getName(), operation, path);
+    requireNonNull(path,
+        () -> String.format("%s: Null path for operation %s", getName(), operation));
+    // check for dir existence before trying to create.
+    try {
+      final FileStatus status = getFileStatus(path);
+      // no exception, so the path exists.
+      throw new FileAlreadyExistsException(operation
+          + ": path " + path
+          + " already exists and has status " + status);
+    } catch (FileNotFoundException e) {
+      // the path does not exist, so create it.
+      mkdirs(path, true);
+      return path;
+    }
+  }
+
+  /**
+   * Assert that a path is a directory which must exist.
+   * @param operation operation for error reporting.
+   * @param path path which must exist.
+   * @return the path
+   * @throws IOException failure
+   * @throws FileNotFoundException if the path does not exist.
+   * @throws PathIOException if the path is not a directory.
+   */
+  protected final Path directoryMustExist(
+      final String operation,
+      final Path path) throws IOException {
+    final FileStatus status = getFileStatus(path);
+    if (!status.isDirectory()) {
+      throw new PathIOException(path.toString(),
+          operation
+              + ": Path is not a directory; its status is :" + status);
+    }
+    return path;
+  }
+
+  /**
+   * Save a task manifest or summary. This will be done by
+   * writing to a temp path and then renaming.
+   * If the destination path exists: Delete it.
+   * @param manifestData the manifest/success file
+   * @param tempPath temp path for the initial save
+   * @param finalPath final path for rename.
+   * @throws IOException failure to save or rename the file
+   */
+  @SuppressWarnings("unchecked")
+  protected final <T extends AbstractManifestData> void save(T manifestData,
+      final Path tempPath,
+      final Path finalPath) throws IOException {
+    LOG.trace("{}: save('{}, {}, {}')", getName(), manifestData, tempPath, finalPath);
+    trackDurationOfInvocation(getIOStatistics(), OP_SAVE_TASK_MANIFEST, () ->
+        operations.save(manifestData, tempPath, true));
+    renameFile(tempPath, finalPath);
+  }
+
+  /**
+   * Get an etag from a FileStatus, if the store operations
+   * implementation can extract one from it.
+   * @param status the status; may be null.
+   * @return the etag or null if not provided
+   */
+  public String getEtag(FileStatus status) {
+    return operations.getEtag(status);
+  }
+
+  /**
+   * Rename a file from source to dest; if the underlying FS API call
+   * returned false that's escalated to an IOE.
+   * @param source source file.
+   * @param dest dest file
+   * @throws IOException failure
+   * @throws PathIOException if the rename() call returned false.
+   */
+  protected final void renameFile(final Path source, final Path dest)
+      throws IOException {
+    maybeDeleteDest(true, dest);
+    executeRenamingOperation("renameFile", source, dest,
+        OP_RENAME_FILE, () ->
+            operations.renameFile(source, dest));
+  }
+
+  /**
+   * Rename a directory from source to dest; if the underlying FS API call
+   * returned false that's escalated to an IOE.
+   * @param source source directory.
+   * @param dest dest directory
+   * @throws IOException failure
+   * @throws PathIOException if the rename() call returned false.
+   */
+  protected final void renameDir(final Path source, final Path dest)
+      throws IOException {
+
+    maybeDeleteDest(true, dest);
+    executeRenamingOperation("renameDir", source, dest,
+        OP_RENAME_FILE, () ->
+        operations.renameDir(source, dest)
+    );
+  }
+
+  /**
+   * Commit a file from the manifest using rename or, if available, resilient renaming.
+   * @param entry entry from manifest
+   * @param deleteDest should the destination be deleted before the rename?
+   * @throws PathIOException if the rename() call returned false and was escalated.
+   * @throws IOException failure
+   */
+  protected final CommitOutcome commitFile(FileEntry entry,
+      boolean deleteDest)
+      throws IOException {
+
+    final Path source = entry.getSourcePath();
+    final Path dest = entry.getDestPath();
+
+    maybeDeleteDest(deleteDest, dest);
+    if (storeSupportsResilientCommit()) {
+      // use the store's resilient commit operation, tracking its duration
+      final ManifestStoreOperations.CommitFileResult result = trackDuration(getIOStatistics(),
+          OP_COMMIT_FILE_RENAME, () ->
+              operations.commitFile(entry));
+      if (result.recovered()) {
+        // recovery took place.
+        getIOStatistics().incrementCounter(OP_COMMIT_FILE_RENAME_RECOVERED);
+      }
+      if (result.getWaitTime() != null) {
+        // note any delay which took place
+        noteAnyRateLimiting(STORE_IO_RATE_LIMITED, result.getWaitTime());
+      }
+    } else {
+      // commit with a simple rename; failures will be escalated.
+      executeRenamingOperation("renameFile", source, dest,
+          OP_COMMIT_FILE_RENAME, () ->
+              operations.renameFile(source, dest));
+    }
+    return new CommitOutcome();
+  }
+
+  /**
+   * Does this store support resilient commit.
+   * @return true if resilient commit operations are available.
+   */
+  protected boolean storeSupportsResilientCommit() {
+    return operations.storeSupportsResilientCommit();
+  }
+
+  private void maybeDeleteDest(final boolean deleteDest, final Path dest) throws IOException {
+    if (deleteDest) {
+      // delete the destination, always, knowing that it's a no-op if
+      // the data isn't there. Skipping the check saves one round trip
+      // to actually look for the file/object
+      boolean deleted = delete(dest, true);
+      // log the outcome in case of emergency diagnostics traces
+      // being needed.
+      LOG.debug("{}: delete('{}') returned {}'", getName(), dest, deleted);
+    }
+  }
+
+  /**
+   * Execute an operation to rename a file/dir, commit a manifest entry.
+   * The statistic is tracked; returning false from the operation is considered
+   * a failure from the statistics perspective.
+   * @param operation operation name
+   * @param source source path
+   * @param dest dest path
+   * @param statistic statistic to track
+   * @param action callable of the operation
+   * @throws IOException on any failure
+   */
+  private void executeRenamingOperation(String operation,
+      Path source,
+      Path dest,
+      String statistic,
+      CallableRaisingIOE<Boolean> action) throws IOException {
+
+    LOG.debug("{}: {} '{}' to '{}')", getName(), operation, source, dest);
+    requireNonNull(source, "Null source");
+    requireNonNull(dest, "Null dest");
+
+    // duration tracking is a bit convoluted as it
+    // ensures that rename failures as well as IOEs are
+    // treated as failures from a statistics perspective.
+
+    DurationTracker tracker = createTracker(getIOStatistics(), statistic);
+    boolean success;
+    try {
+      success = action.apply();
+      if (!success) {
+        // record failure in the tracker before closing it
+        tracker.failed();
+      }
+    } catch (IOException | RuntimeException e) {
+      LOG.info("{}: {} raised an exception: {}", getName(), operation, e.toString());
+      LOG.debug("{}: {} stack trace", getName(), operation, e);
+      tracker.failed();
+      throw e;
+    } finally {
+      // update the tracker; if failed() was already called,
+      // closing it records the failure.
+      tracker.close();
+    }
+    // escalate the failure; this is done out of the duration tracker
+    // so its file status probes aren't included.
+    if (!success) {
+      throw escalateRenameFailure(operation, source, dest);
+    }
+  }
+
+  /**
+   * Escalate a rename failure to an exception.
+   * Returns an error exception to throw if one was not
+   * triggered when probing for the source.
+   * @param operation operation name
+   * @param source source path
+   * @param dest dest path
+   * @return an exception to throw
+   * @throws IOException raised probing for source or dest
+   */
+  private PathIOException escalateRenameFailure(String operation,
+      Path source, Path dest) throws IOException {
+    // rename just returned false.
+    // collect information for a meaningful error message
+    // and include in an exception raised.
+
+    // get the source status; this will implicitly raise a FNFE.
+    final FileStatus sourceStatus = getFileStatus(source);
+
+    // and look to see if there is anything at the destination
+    final FileStatus destStatus = getFileStatusOrNull(dest);
+
+    LOG.error("{}: failure to {} {} to {} with" +
+            " source status {} " +
+            " and destination status {}",
+        getName(), operation, source, dest,
+        sourceStatus, destStatus);
+
+    return new PathIOException(source.toString(),
+        FAILED_TO_RENAME_PREFIX + operation + " to " + dest);
+  }
+
+  /**
+   * Outcome from the commit.
+   */
+  public static final class CommitOutcome {
+
+  }
+
+  /**
+   * Job ID: never null.
+   */
+  protected final String getJobId() {
+    return stageConfig.getJobId();
+  }
+
+  /**
+   * Job attempt number.
+   */
+  protected final int getJobAttemptNumber() {
+    return stageConfig.getJobAttemptNumber();
+  }
+
+  /**
+   * ID of the task.
+   */
+  protected final String getTaskId() {
+    return stageConfig.getTaskId();
+  }
+
+  /**
+   * Get the task ID; raise an NPE
+   * if it is null.
+   * @return a non-null task ID.
+   */
+  protected final String getRequiredTaskId() {
+    return requireNonNull(getTaskId(),
+        "No Task ID in stage config");
+  }
+
+  /**
+   * ID of this specific attempt at a task.
+   */
+  protected final String getTaskAttemptId() {
+    return stageConfig.getTaskAttemptId();
+  }
+
+  /**
+   * Get the task attempt ID; raise an NPE
+   * if it is null.
+   * @return a non-null task attempt ID.
+   */
+  protected final String getRequiredTaskAttemptId() {
+    return requireNonNull(getTaskAttemptId(),
+        "No Task Attempt ID in stage config");
+  }
+
+  /**
+   * Job attempt dir.
+   */
+  protected final Path getJobAttemptDir() {
+    return stageConfig.getJobAttemptDir();
+  }
+
+  /**
+   * Directory to put task manifests into.
+   * @return a path under the job attempt dir.
+   */
+  protected final Path getTaskManifestDir() {
+    return stageConfig.getTaskManifestDir();
+  }
+
+
+  /**
+   * Task attempt dir.
+   */
+  protected final Path getTaskAttemptDir() {
+    return stageConfig.getTaskAttemptDir();
+  }
+
+  /**
+   * Get the task attemptDir; raise an NPE
+   * if it is null.
+   * @return a non-null task attempt dir.
+   */
+  protected final Path getRequiredTaskAttemptDir() {
+    return requireNonNull(getTaskAttemptDir(),
+        "No Task Attempt Dir");
+  }
+
+  /**
+   * Destination of job.
+   */
+  protected final Path getDestinationDir() {
+    return stageConfig.getDestinationDir();
+  }
+
+  /**
+   * Stage name, for logging.
+   * @return name.
+   */
+  public final String getName() {
+    return name;
+  }
+
+  /**
+   * Submitter for doing IO against the store other than
+   * manifest processing.
+   */
+  protected final TaskPool.Submitter getIOProcessors() {
+    return ioProcessors;
+  }
+
+  /**
+   * Submitter for doing IO against the store other than
+   * manifest processing.
+   * The size parameter is used to select between sequential
+   * and parallel runners.
+   * No data, or one entry: serial.
+   * Everything else: parallel.
+   * @param size number of items.
+   * @return a submitter or null
+   */
+  protected final TaskPool.Submitter getIOProcessors(int size) {
+    return size > 1
+        ? getIOProcessors()
+        : null;
+  }
+
+  /**
+   * Delete a directory, possibly suppressing exceptions.
+   * @param dir directory.
+   * @param suppressExceptions should exceptions be suppressed?
+   * @throws IOException exceptions raised in delete if not suppressed.
+   * @return any exception caught and suppressed
+   */
+  protected IOException deleteDir(
+      final Path dir,
+      final Boolean suppressExceptions)
+      throws IOException {
+    try {
+      delete(dir, true);
+      return null;
+    } catch (IOException ex) {
+      LOG.info("Error deleting {}: {}", dir, ex.toString());
+      if (!suppressExceptions) {
+        throw ex;
+      } else {
+        return ex;
+      }
+    }
+  }
+
+  /**
+   * Create an entry for a file to rename under the destination.
+   * If the store operations support extracting etags from file status
+   * entries, the etag is included in the entry.
+   * @param status source file
+   * @param destDir destination directory
+   * @return an entry which includes the rename path
+   */
+  protected FileEntry fileEntry(FileStatus status, Path destDir) {
+    // generate a new path under the dest dir
+    Path dest = new Path(destDir, status.getPath().getName());
+    return new FileEntry(status.getPath(),
+        dest,
+        status.getLen(),
+        getEtag(status));
+  }
+
+}

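To illustrate the execute-once stage protocol described in the class javadoc above, here is a hypothetical subclass; the statistic name and the probe logic are invented for the example, and a real stage would use one of the ManifestCommitterStatisticNames constants.

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.AbstractJobOrTaskStage;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

    /** Hypothetical job-level stage: probe whether a path exists. */
    final class ProbePathStage extends AbstractJobOrTaskStage<Path, Boolean> {

      ProbePathStage(StageConfig stageConfig) {
        // job stage (not a task stage), illustrative statistic name,
        // no parallel IO processors required.
        super(false, stageConfig, "op_stage_job_probe_path", false);
      }

      @Override
      protected Boolean executeStage(Path path) throws IOException {
        // apply() wraps this call with duration tracking, logging and
        // the "execute once only" check.
        return getFileStatusOrNull(path) != null;
      }
    }

    // usage: each instance may be applied exactly once.
    // boolean exists = new ProbePathStage(stageConfig).apply(path);
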
+ 511 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java

@@ -0,0 +1,511 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.util.DurationInfo;
+import org.apache.hadoop.util.functional.RemoteIterators;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED_DEFAULT;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED_DEFAULT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE_DIRS_DEFAULT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;
+
+/**
+ * Clean up a job's temporary directory: optionally delete task attempt
+ * directories in parallel, then delete the base _temporary directory.
+ * Returns: the outcome of the overall operation
+ * The result is detailed purely for the benefit of tests, which need
+ * to make assertions about error handling and fallbacks.
+ *
+ * There are a few known issues with the Azure and GCS stores which
+ * this stage tries to address.
+ * - Google GCS directory deletion is O(entries), so is slower for big jobs.
+ * - Azure storage directory delete, when using OAuth authentication or
+ *   when not the store owner, triggers a scan down the tree to verify the
+ *   caller has the permission to delete each subdir.
+ *   If this scan takes over 90s, the operation can time out.
+ *
+ * The main solution for both of these is that task attempts are
+ * deleted in parallel, in different threads.
+ * This will speed up GCS cleanup and reduce the risk of
+ * abfs related timeouts.
+ * Exceptions during cleanup can be suppressed,
+ * so that these do not cause the job to fail.
+ *
+ * Also, some users want to be able to run multiple independent jobs
+ * targeting the same output directory simultaneously.
+ * If one job deletes the directory `_temporary`, all the others
+ * will fail.
+ *
+ * This can be addressed by disabling cleanup entirely.
+ *
+ */
+public class CleanupJobStage extends
+    AbstractJobOrTaskStage<
+            CleanupJobStage.Arguments,
+            CleanupJobStage.Result> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      CleanupJobStage.class);
+
+  /**
+   * Count of deleted directories.
+   */
+  private final AtomicInteger deleteDirCount = new AtomicInteger();
+
+  /**
+   * Count of delete failures.
+   */
+  private final AtomicInteger deleteFailureCount = new AtomicInteger();
+
+  /**
+   * Last delete exception; non null if deleteFailureCount is not zero.
+   */
+  private IOException lastDeleteException;
+
+  /**
+   * Stage name as passed in from arguments.
+   */
+  private String stageName = OP_STAGE_JOB_CLEANUP;
+
+  public CleanupJobStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_CLEANUP, true);
+  }
+
+  /**
+   * Statistic name is extracted from the arguments.
+   * @param arguments args to the invocation.
+   * @return stage name.
+   */
+  @Override
+  protected String getStageStatisticName(Arguments arguments) {
+    return arguments.statisticName;
+  }
+
+  /**
+   * Clean up the job attempt directory tree.
+   * @param args arguments built up.
+   * @return the result.
+   * @throws IOException a failure was raised and exceptions weren't suppressed.
+   */
+  @Override
+  protected Result executeStage(
+      final Arguments args)
+      throws IOException {
+    stageName = getStageName(args);
+    // this is $dest/_temporary
+    final Path baseDir = requireNonNull(getStageConfig().getOutputTempSubDir());
+    LOG.debug("{}: Cleaup of directory {} with {}", getName(), baseDir, args);
+    if (!args.enabled) {
+      LOG.info("{}: Cleanup of {} disabled", getName(), baseDir);
+      return new Result(Outcome.DISABLED, baseDir,
+          0, null);
+    }
+    // shortcut of a single existence check before anything else
+    if (getFileStatusOrNull(baseDir) == null) {
+      return new Result(Outcome.NOTHING_TO_CLEAN_UP,
+          baseDir,
+          0, null);
+    }
+
+    Outcome outcome = null;
+    IOException exception;
+
+
+    // the base directory to delete.
+    LOG.info("{}: Deleting job directory {}", getName(), baseDir);
+
+    if (args.deleteTaskAttemptDirsInParallel) {
+      // Attempt to do a parallel delete of task attempt dirs;
+      // don't overreact if a delete fails, but stop trying
+      // to delete the others, and fall back to deleting the
+      // job dir.
+      Path taskSubDir
+          = getStageConfig().getJobAttemptTaskSubDir();
+      try (DurationInfo info = new DurationInfo(LOG,
+          "parallel deletion of task attempts in %s",
+          taskSubDir)) {
+        RemoteIterator<FileStatus> dirs =
+            RemoteIterators.filteringRemoteIterator(
+                listStatusIterator(taskSubDir),
+                FileStatus::isDirectory);
+        TaskPool.foreach(dirs)
+            .executeWith(getIOProcessors())
+            .stopOnFailure()
+            .suppressExceptions(false)
+            .run(this::rmTaskAttemptDir);
+        getIOStatistics().aggregate((retrieveIOStatistics(dirs)));
+
+        if (getLastDeleteException() != null) {
+          // one of the task attempts failed.
+          throw getLastDeleteException();
+        }
+        // success: record this as the outcome.
+        outcome = Outcome.PARALLEL_DELETE;
+      } catch (FileNotFoundException ex) {
+        // not a problem if there's no dir to list.
+        LOG.debug("{}: Task attempt dir {} not found", getName(), taskSubDir);
+        outcome = Outcome.DELETED;
+      } catch (IOException ex) {
+        // failure. Log and continue
+        LOG.info(
+            "{}: Exception while listing/deleting task attempts under {}; continuing",
+            getName(),
+            taskSubDir, ex);
+        // not overreacting here as the base delete will still get executed
+        outcome = Outcome.DELETED;
+      }
+    }
+    // Now the top-level deletion; exception gets saved
+    exception = deleteOneDir(baseDir);
+    if (exception != null) {
+      // failure, report and continue
+      // assume failure.
+      outcome = Outcome.FAILURE;
+    } else {
+      // if the outcome isn't already recorded as parallel delete,
+      // mark it as a simple delete.
+      if (outcome == null) {
+        outcome = Outcome.DELETED;
+      }
+    }
+
+    Result result = new Result(
+        outcome,
+        baseDir,
+        deleteDirCount.get(),
+        exception);
+    if (!result.succeeded() && !args.suppressExceptions) {
+      result.maybeRethrowException();
+    }
+
+    return result;
+  }
+
+  /**
+   * Delete a single TA dir in a parallel task.
+   * Updates the audit context.
+   * Exceptions are swallowed so that attempts are still made
+   * to delete the others, but the first exception
+   * caught is saved in a field which can be retrieved
+   * via {@link #getLastDeleteException()}.
+   *
+   * @param status dir to be deleted.
+   * @throws IOException delete failure.
+   */
+  private void rmTaskAttemptDir(FileStatus status) throws IOException {
+    // stage name in audit context is the one set in the arguments.
+    updateAuditContext(stageName);
+    // update the progress callback in case delete is really slow.
+    progress();
+    deleteOneDir(status.getPath());
+  }
+
+  /**
+   * Delete a directory.
+   * The {@link #deleteFailureCount} counter
+   * is incremented on every failure.
+   * @param dir directory
+   * @throws IOException if an IOE was raised
+   * @return any IOE raised.
+   */
+  private IOException deleteOneDir(final Path dir)
+      throws IOException {
+
+    deleteDirCount.incrementAndGet();
+    IOException ex = deleteDir(dir, true);
+    if (ex != null) {
+      deleteFailure(ex);
+    }
+    return ex;
+  }
+
+  /**
+   * Note a failure.
+   * @param ex exception
+   */
+  private synchronized void deleteFailure(IOException ex) {
+    // exception: increment the failure count
+    deleteFailureCount.incrementAndGet();
+    lastDeleteException = ex;
+  }
+
+  /**
+   * Get the last delete exception; synchronized.
+   * @return the last delete exception or null.
+   */
+  public synchronized IOException getLastDeleteException() {
+    return lastDeleteException;
+  }
+
+  /**
+   * Options to pass down to the cleanup stage.
+   */
+  public static final class Arguments {
+
+    /**
+     * Statistic to update.
+     */
+    private final String statisticName;
+
+    /** Delete is enabled? */
+    private final boolean enabled;
+
+    /** Attempt parallel delete of task attempt dirs? */
+    private final boolean deleteTaskAttemptDirsInParallel;
+
+    /** Ignore failures? */
+    private final boolean suppressExceptions;
+
+    /**
+     * Arguments to the stage.
+     * @param statisticName stage name to report
+     * @param enabled is the stage enabled?
+     * @param deleteTaskAttemptDirsInParallel delete task attempt dirs in
+     *        parallel?
+     * @param suppressExceptions suppress exceptions?
+     */
+    public Arguments(
+        final String statisticName,
+        final boolean enabled,
+        final boolean deleteTaskAttemptDirsInParallel,
+        final boolean suppressExceptions) {
+      this.statisticName = statisticName;
+      this.enabled = enabled;
+      this.deleteTaskAttemptDirsInParallel = deleteTaskAttemptDirsInParallel;
+      this.suppressExceptions = suppressExceptions;
+    }
+
+    public String getStatisticName() {
+      return statisticName;
+    }
+
+    public boolean isEnabled() {
+      return enabled;
+    }
+
+    public boolean isDeleteTaskAttemptDirsInParallel() {
+      return deleteTaskAttemptDirsInParallel;
+    }
+
+    public boolean isSuppressExceptions() {
+      return suppressExceptions;
+    }
+
+    @Override
+    public String toString() {
+      return "Arguments{" +
+          "statisticName='" + statisticName + '\''
+          + ", enabled=" + enabled
+          + ", deleteTaskAttemptDirsInParallel="
+          + deleteTaskAttemptDirsInParallel
+          + ", suppressExceptions=" + suppressExceptions
+          + '}';
+    }
+  }
+
+  /**
+   * Static disabled arguments.
+   */
+  public static final Arguments DISABLED = new Arguments(OP_STAGE_JOB_CLEANUP,
+      false,
+      false,
+      false
+  );
+
+  /**
+   * Build an options argument from a configuration, using the
+   * settings from FileOutputCommitter and manifest committer.
+   * @param statisticName statistic name to use in duration tracking.
+   * @param conf configuration to use.
+   * @return the options to process
+   */
+  public static Arguments cleanupStageOptionsFromConfig(
+      String statisticName, Configuration conf) {
+
+    boolean enabled = !conf.getBoolean(FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED,
+        FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED_DEFAULT);
+    boolean suppressExceptions = conf.getBoolean(
+        FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED,
+        FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED_DEFAULT);
+    boolean deleteTaskAttemptDirsInParallel = conf.getBoolean(
+        OPT_CLEANUP_PARALLEL_DELETE,
+        OPT_CLEANUP_PARALLEL_DELETE_DIRS_DEFAULT);
+    return new Arguments(
+        statisticName,
+        enabled,
+        deleteTaskAttemptDirsInParallel,
+        suppressExceptions
+    );
+  }
+
+  /**
+   * Enum of outcomes.
+   */
+  public enum Outcome {
+    DISABLED("Disabled", false),
+    NOTHING_TO_CLEAN_UP("Nothing to clean up", true),
+    PARALLEL_DELETE("Parallel Delete of Task Attempt Directories", true),
+    DELETED("Delete of job directory", true),
+    FAILURE("Delete failed", false);
+
+    private final String description;
+
+    private final boolean success;
+
+    Outcome(String description, boolean success) {
+      this.description = description;
+      this.success = success;
+    }
+
+    @Override
+    public String toString() {
+      return "Outcome{" + name() +
+          " '" + description + '\'' +
+          "}";
+    }
+
+    /**
+     * description.
+     * @return text for logging
+     */
+    public String getDescription() {
+      return description;
+    }
+
+    /**
+     * Was this a success?
+     * @return true if this outcome is good.
+     */
+    public boolean isSuccess() {
+      return success;
+    }
+  }
+
+  /**
+   * Result of the cleanup.
+   * If the outcome == FAILURE but exceptions were suppressed
+   * (which they are implicitly if an instance of this object
+   * is created and returned), then the exception
+   * MUST NOT be null.
+   */
+  public static final class Result {
+
+    /** Outcome. */
+    private final Outcome outcome;
+
+    /** Directory cleaned up. */
+    private final Path directory;
+
+    /**
+     * Number of delete calls made across all threads.
+     */
+    private final int deleteCalls;
+
+    /**
+     * Any IOE raised.
+     */
+    private final IOException exception;
+
+    public Result(
+        final Outcome outcome,
+        final Path directory,
+        final int deleteCalls,
+        IOException exception) {
+      this.outcome = requireNonNull(outcome, "outcome");
+      this.directory = directory;
+      this.deleteCalls = deleteCalls;
+      this.exception = exception;
+      if (outcome == Outcome.FAILURE) {
+        requireNonNull(exception, "No exception in failure result");
+      }
+    }
+
+    public Path getDirectory() {
+      return directory;
+    }
+
+    public boolean wasExecuted() {
+      return outcome != Outcome.DISABLED;
+    }
+
+    /**
+     * Was the outcome a success?
+     * That is: either the dir wasn't there or through
+     * delete/rename it is no longer there.
+     * @return true if the temporary dir no longer exists.
+     */
+    public boolean succeeded() {
+      return outcome.isSuccess();
+    }
+
+    public Outcome getOutcome() {
+      return outcome;
+    }
+
+    public int getDeleteCalls() {
+      return deleteCalls;
+    }
+
+    public IOException getException() {
+      return exception;
+    }
+
+    /**
+     * If there was an IOE caught, throw it.
+     * For ease of use in (meaningful) lambda expressions
+     * in tests, returns the string value if there
+     * was no exception to throw.
+     * @throws IOException exception.
+     */
+    public String maybeRethrowException() throws IOException {
+      if (exception != null) {
+        throw exception;
+      }
+      return toString();
+    }
+
+    @Override
+    public String toString() {
+      return "CleanupResult{" +
+          "outcome=" + outcome +
+          ", directory=" + directory +
+          ", deleteCalls=" + deleteCalls +
+          ", exception=" + exception +
+          '}';
+    }
+  }
+}

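A sketch of driving this stage from configuration, using the cleanupStageOptionsFromConfig() helper shown above; stageConfig is assumed to be the job's populated StageConfig.

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

    import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;

    final class CleanupExample {
      /** Run cleanup with options derived from the job configuration. */
      static CleanupJobStage.Result cleanup(StageConfig stageConfig, Configuration conf)
          throws IOException {
        CleanupJobStage.Arguments args =
            CleanupJobStage.cleanupStageOptionsFromConfig(OP_STAGE_JOB_CLEANUP, conf);
        return new CleanupJobStage(stageConfig).apply(args);
      }
    }
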
+ 245 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitJobStage.java

@@ -0,0 +1,245 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+import java.util.List;
+
+import javax.annotation.Nullable;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.commons.io.FileUtils.byteCountToDisplaySize;
+import static org.apache.commons.lang3.StringUtils.isNotBlank;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_BYTES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_FILES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.MANIFESTS;
+
+/**
+ * Commit the Job.
+ * Input: {@link Arguments} (create the _SUCCESS marker, validate output,
+ * manifest rename dir, cleanup options).
+ * Output: {@link Result} (the success data and the path it was saved to).
+ */
+public class CommitJobStage extends
+    AbstractJobOrTaskStage<
+            CommitJobStage.Arguments,
+            CommitJobStage.Result> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      CommitJobStage.class);
+
+  public CommitJobStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_COMMIT, true);
+  }
+
+  @Override
+  protected CommitJobStage.Result executeStage(
+      final CommitJobStage.Arguments arguments) throws IOException {
+
+    LOG.info("{}: Committing job \"{}\". resilient commit supported = {}",
+        getName(),
+        getJobId(),
+        storeSupportsResilientCommit());
+
+    boolean createMarker = arguments.isCreateMarker();
+
+    // load the manifests
+    final StageConfig stageConfig = getStageConfig();
+    LoadManifestsStage.Result result
+        = new LoadManifestsStage(stageConfig).apply(true);
+    List<TaskManifest> manifests = result.getManifests();
+    LoadManifestsStage.SummaryInfo summary = result.getSummary();
+
+    LOG.debug("{}: Job Summary {}", getName(), summary);
+    LOG.info("{}: Committing job with file count: {}; total size {} bytes",
+        getName(),
+        summary.getFileCount(),
+        byteCountToDisplaySize(summary.getTotalFileSize()));
+
+
+    // add in the manifest statistics to our local IOStatistics for
+    // reporting.
+    IOStatisticsStore iostats = getIOStatistics();
+    iostats.aggregate(summary.getIOStatistics());
+
+    // prepare destination directories.
+    final CreateOutputDirectoriesStage.Result dirStageResults =
+        new CreateOutputDirectoriesStage(stageConfig)
+            .apply(manifests);
+
+    // commit all the tasks.
+    // The success data includes a snapshot of the IO Statistics
+    // and hence all aggregate stats from the tasks.
+    ManifestSuccessData successData;
+    successData = new RenameFilesStage(stageConfig).apply(
+        Pair.of(manifests, dirStageResults.getCreatedDirectories()));
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("{}: _SUCCESS file summary {}", getName(), successData.toJson());
+    }
+    // update the counter of bytes committed and files.
+    // use setCounter so as to ignore any values accumulated when
+    // aggregating tasks.
+    iostats.setCounter(
+        COMMITTER_FILES_COMMITTED_COUNT,
+        summary.getFileCount());
+    iostats.setCounter(
+        COMMITTER_BYTES_COMMITTED_COUNT,
+        summary.getTotalFileSize());
+    successData.snapshotIOStatistics(iostats);
+
+
+    // rename manifests. Only warn on failure here.
+    final String manifestRenameDir = arguments.getManifestRenameDir();
+    if (isNotBlank(manifestRenameDir)) {
+      Path manifestRenamePath = new Path(
+          new Path(manifestRenameDir),
+          getJobId());
+      LOG.info("{}: Renaming manifests to {}", getName(), manifestRenamePath);
+      try {
+        renameDir(getTaskManifestDir(), manifestRenamePath);
+
+        // save this path in the summary diagnostics
+        successData.getDiagnostics().put(MANIFESTS, manifestRenamePath.toUri().toString());
+      } catch (IOException | IllegalArgumentException e) {
+        // rename failure, including path for wrong filesystem
+        LOG.warn("{}: Failed to rename manifests to {}", getName(), manifestRenamePath, e);
+      }
+    }
+
+    // save the _SUCCESS if the option is enabled.
+    Path successPath = null;
+    if (createMarker) {
+      // save a snapshot of the IO Statistics
+
+      successPath = new SaveSuccessFileStage(stageConfig)
+          .apply(successData);
+      LOG.debug("{}: Saving _SUCCESS file to {}", getName(), successPath);
+    }
+
+    // optional cleanup
+    new CleanupJobStage(stageConfig).apply(arguments.getCleanupArguments());
+
+    // and then, after everything else: optionally validate.
+    if (arguments.isValidateOutput()) {
+      // cache and restore the active stage field
+      LOG.info("{}: Validating output.", getName());
+      new ValidateRenamedFilesStage(stageConfig)
+          .apply(result.getManifests());
+    }
+
+    // restore the active stage so that when the report is saved
+    // it is declared as job commit, not cleanup or validate.
+    stageConfig.enterStage(getStageName(arguments));
+
+    // the result
+    return new CommitJobStage.Result(successPath, successData);
+  }
+
+  /**
+   * Arguments for job commit.
+   */
+  public static final class Arguments {
+
+    /** create the _SUCCESS marker? */
+    private final boolean createMarker;
+
+    /** perform validation checks on the files? */
+    private final boolean validateOutput;
+
+    /** optional directory to rename the task manifests to. */
+    private final String manifestRenameDir;
+
+    /** cleanup arguments. */
+    private final CleanupJobStage.Arguments cleanupArguments;
+
+    /**
+     * Constructor.
+     * @param createMarker create the _SUCCESS marker?
+     * @param validateOutput perform validation checks on the files?
+     * @param manifestRenameDir optional directory to rename the task manifests to
+     * @param cleanupArguments cleanup arguments.
+     */
+    public Arguments(
+        boolean createMarker,
+        boolean validateOutput,
+        @Nullable String manifestRenameDir,
+        CleanupJobStage.Arguments cleanupArguments) {
+
+      this.createMarker = createMarker;
+      this.validateOutput = validateOutput;
+      this.manifestRenameDir = manifestRenameDir;
+      this.cleanupArguments = requireNonNull(cleanupArguments);
+    }
+
+    public boolean isCreateMarker() {
+      return createMarker;
+    }
+
+    public boolean isValidateOutput() {
+      return validateOutput;
+    }
+
+    public String getManifestRenameDir() {
+      return manifestRenameDir;
+    }
+
+    public CleanupJobStage.Arguments getCleanupArguments() {
+      return cleanupArguments;
+    }
+  }
+
+  /**
+   * Result of the stage.
+   */
+  public static final class Result {
+    /**
+     * Manifest success data.
+     */
+    private final ManifestSuccessData jobSuccessData;
+
+    /**
+     * Success file path. null if not saved.
+     */
+    private final Path successPath;
+
+    public Result(final Path successPath,
+        ManifestSuccessData jobSuccessData) {
+      this.successPath = successPath;
+      this.jobSuccessData = jobSuccessData;
+    }
+
+    public ManifestSuccessData getJobSuccessData() {
+      return jobSuccessData;
+    }
+
+    public Path getSuccessPath() {
+      return successPath;
+    }
+  }
+}
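For orientation only, a minimal sketch of driving this stage from a job driver. It assumes CommitJobStage takes a single StageConfig like the other stages in this package; building the StageConfig and the CleanupJobStage.Arguments is left to the caller and the wrapper class is hypothetical.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitJobStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class CommitJobSketch {

  /** Commit the job; returns the _SUCCESS path, or null if no marker was written. */
  static Path commitJob(StageConfig stageConfig,
      CleanupJobStage.Arguments cleanup) throws IOException {
    CommitJobStage.Arguments args = new CommitJobStage.Arguments(
        true,      // create the _SUCCESS marker
        false,     // skip the extra validation scan of renamed files
        null,      // do not preserve the task manifests anywhere
        cleanup);  // cleanup policy chosen by the caller
    CommitJobStage.Result result = new CommitJobStage(stageConfig).apply(args);
    // result.getJobSuccessData() holds the summary written to _SUCCESS
    return result.getSuccessPath();
  }
}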

+ 115 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitTaskStage.java

@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+
+import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_COMMIT;
+
+/**
+ * Commit a task attempt.
+ * Scan the task attempt directories through
+ * {@link TaskAttemptScanDirectoryStage}
+ * and then save the resulting manifest to its final path via
+ * {@link SaveTaskManifestStage}.
+ */
+
+public class CommitTaskStage extends
+    AbstractJobOrTaskStage<Void, CommitTaskStage.Result> {
+  private static final Logger LOG = LoggerFactory.getLogger(
+      CommitTaskStage.class);
+
+  public CommitTaskStage(final StageConfig stageConfig) {
+    super(true, stageConfig, OP_STAGE_TASK_COMMIT, false);
+  }
+
+  /**
+   * Scan the task attempt dir then save the manifest.
+   * A snapshot of the IOStats will be included in the manifest;
+   * this includes the scan time.
+   * @param arguments arguments to the function.
+   * @return the path the manifest was saved to, and the manifest.
+   * @throws IOException IO failure.
+   */
+  @Override
+  protected CommitTaskStage.Result executeStage(final Void arguments)
+      throws IOException {
+    LOG.info("{}: Committing task \"{}\"", getName(), getTaskAttemptId());
+
+    // execute the scan
+    final TaskAttemptScanDirectoryStage scanStage =
+        new TaskAttemptScanDirectoryStage(getStageConfig());
+    TaskManifest manifest = scanStage.apply(arguments);
+
+    // record the scan duration as the task commit duration. This is not
+    // quite the whole commit, as it omits the manifest save, but it
+    // covers the bulk of the work.
+    scanStage.addExecutionDurationToStatistics(getIOStatistics(), OP_STAGE_TASK_COMMIT);
+
+    // save a snapshot of the IO Statistics
+    final IOStatisticsSnapshot manifestStats = snapshotIOStatistics();
+    manifestStats.aggregate(getIOStatistics());
+    manifest.setIOStatistics(manifestStats);
+
+    // Now save with rename
+    Path manifestPath = new SaveTaskManifestStage(getStageConfig())
+        .apply(manifest);
+    return new CommitTaskStage.Result(manifestPath, manifest);
+  }
+
+  /**
+   * Result of the stage.
+   */
+  public static final class Result {
+    /** The path the manifest was saved to. */
+    private final Path path;
+    /** The manifest. */
+    private final TaskManifest taskManifest;
+
+    public Result(Path path,
+        TaskManifest taskManifest) {
+      this.path = path;
+      this.taskManifest = taskManifest;
+    }
+
+    /**
+     * Get the manifest path.
+     * @return The path the manifest was saved to.
+     */
+    public Path getPath() {
+      return path;
+    }
+
+    /**
+     * Get the manifest.
+     * @return The manifest.
+     */
+    public TaskManifest getTaskManifest() {
+      return taskManifest;
+    }
+
+  }
+}
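As a usage sketch against the constructor and Result type above; the StageConfig is assumed to have been built elsewhere with the task attempt and manifest directories set, and the wrapper class is illustrative only.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitTaskStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class CommitTaskSketch {

  /** Scan the task attempt dir and save its manifest; returns the manifest path. */
  static Path commitTask(StageConfig taskStageConfig) throws IOException {
    // the stage takes no argument: everything comes from the StageConfig
    CommitTaskStage.Result result =
        new CommitTaskStage(taskStageConfig).apply(null);
    // result.getTaskManifest() lists the files to rename and the
    // directories to create during job commit.
    return result.getPath();
  }
}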

+ 423 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java

@@ -0,0 +1,423 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.EntryStatus;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_DELETE;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.measureDurationOfInvocation;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_CREATE_DIRECTORIES;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_DELETE_FILE_UNDER_DESTINATION;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_MKDIRS_RETURNED_FALSE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_PREPARE_DIR_ANCESTORS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CREATE_TARGET_DIRS;
+import static org.apache.hadoop.util.OperationDuration.humanTime;
+
+/**
+ * Prepare the destination directory tree as efficiently as possible,
+ * performing the IO operations in the thread pool.
+ *
+ * The classic FileOutputCommitter does a recursive treewalk and
+ * deletes any files found at paths where directories are to be created.
+ *
+ * Each task manifest's directories are combined with those of the other tasks
+ * to build a set of all directories which are needed, without duplicates.
+ *
+ * This stage requires the aggregate set of manifests to contain
+ * all directories to create, including their level in the tree,
+ * and expects them to have been probed for existence/state.
+ *
+ * For each level, all dirs are processed in parallel to
+ * be created or, if files, deleted.
+ *
+ * The stage returns the list of directories created, and for testing,
+ * the map of paths to outcomes.
+ *
+ * Directory creation can be surprisingly slow against object stores,
+ * so use benchmarks from real test runs when tuning this algorithm.
+ */
+public class CreateOutputDirectoriesStage extends
+    AbstractJobOrTaskStage<
+        List<TaskManifest>,
+        CreateOutputDirectoriesStage.Result> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      CreateOutputDirectoriesStage.class);
+
+  /**
+   * Directories as a map of (path, state).
+   * A map rather than a set is used for efficient concurrent access;
+   * the concurrent set implementations have slower lookups.
+   */
+  private final Map<Path, DirMapState> dirMap = new ConcurrentHashMap<>();
+
+  /**
+   * A list of created paths for the results.
+   */
+  private final List<Path> createdDirectories = new ArrayList<>();
+
+  public CreateOutputDirectoriesStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_CREATE_TARGET_DIRS, true);
+    // add the dest dir to the dir map as we expect the job setup to create it.
+    dirMap.put(getDestinationDir(), DirMapState.dirWasCreated);
+  }
+
+  @Override
+  protected Result executeStage(
+      final List<TaskManifest> taskManifests)
+      throws IOException {
+
+    final List<Path> directories = createAllDirectories(taskManifests);
+    LOG.debug("{}: Created {} directories", getName(), directories.size());
+    return new Result(new HashSet<>(directories), dirMap);
+  }
+
+  /**
+   * Build the combined set of directories needed across all tasks,
+   * delete any files in the way, then create the leaf directories.
+   * @param taskManifests task manifests
+   * @return the list of paths which have been created.
+   * @throws IOException IO failure.
+   */
+  private List<Path> createAllDirectories(final List<TaskManifest> taskManifests)
+      throws IOException {
+
+    // all directories which need to exist across all
+    // tasks.
+    // leaf directories
+    final Map<Path, DirEntry> leaves = new HashMap<>();
+    // parent directories. these do not need to be
+    // explicitly created.
+    final Map<Path, DirEntry> parents = new HashMap<>();
+    // the files which must be deleted as a directory
+    // will be created at that path.
+    final Set<Path> filesToDelete = new HashSet<>();
+
+    // iterate through the task manifests
+    // and all output dirs into the set of dirs to
+    // create.
+    // hopefully there is a lot of overlap, so the
+    // final number of dirs to create is small.
+    for (TaskManifest task: taskManifests) {
+      final List<DirEntry> destDirectories = task.getDestDirectories();
+      Collections.sort(destDirectories, (o1, o2) ->
+          o1.getLevel() - o2.getLevel());
+      for (DirEntry entry: destDirectories) {
+        // add the dest entry
+        final Path path = entry.getDestPath();
+        if (!leaves.containsKey(path)) {
+          leaves.put(path, entry);
+
+          // if it is a file to delete, record this.
+          if (entry.getStatus() == EntryStatus.file) {
+            filesToDelete.add(path);
+          }
+          final Path parent = path.getParent();
+          if (parent != null && leaves.containsKey(parent)) {
+            // there's a parent dir, move it from the leaf list
+            // to parent list
+            parents.put(parent,
+                leaves.remove(parent));
+          }
+        }
+      }
+    }
+
+    // at this point there is a map of all directories which are leaf
+    // entries and so need to be created if not already present, plus
+    // the set of files which must be deleted first.
+
+    // Delete the files occupying paths where directories are to be created.
+    deleteFiles(filesToDelete);
+
+    // Now the real work.
+    final int createCount = leaves.size();
+    LOG.info("Preparing {} directory/directories", createCount);
+    // now probe for and create the leaf dirs, which are those at the
+    // bottom level
+    Duration d = measureDurationOfInvocation(getIOStatistics(), OP_CREATE_DIRECTORIES, () ->
+        TaskPool.foreach(leaves.values())
+            .executeWith(getIOProcessors(createCount))
+            .onFailure(this::reportMkDirFailure)
+            .stopOnFailure()
+            .run(this::createOneDirectory));
+    LOG.info("Time to prepare directories {}", humanTime(d.toMillis()));
+    return createdDirectories;
+  }
+
+  /**
+   * How many failures have been reported.
+   */
+  private final AtomicInteger failureCount = new AtomicInteger();
+
+  /**
+   * Report a single directory creation failure.
+   * @param dirEntry entry for the directory which could not be created
+   * @param e exception raised.
+   */
+  private void reportMkDirFailure(DirEntry dirEntry, Exception e) {
+    Path path = dirEntry.getDestPath();
+    final int count = failureCount.incrementAndGet();
+    LOG.warn("{}: mkdir failure #{} Failed to create directory \"{}\": {}",
+        getName(), count, path, e.toString());
+    LOG.debug("{}: Full exception details",
+        getName(), e);
+  }
+
+  /**
+   * Delete all files which exist at paths where directories are to be created.
+   * @param filesToDelete paths of the files to delete.
+   * @throws IOException IO problem
+   */
+  private void deleteFiles(final Set<Path> filesToDelete)
+      throws IOException {
+
+    final int size = filesToDelete.size();
+    if (size == 0) {
+      // nothing to delete.
+      return;
+    }
+    LOG.info("{}: Directory entries containing files to delete: {}", getName(), size);
+    Duration d = measureDurationOfInvocation(getIOStatistics(),
+        OP_PREPARE_DIR_ANCESTORS, () ->
+            TaskPool.foreach(filesToDelete)
+                .executeWith(getIOProcessors(size))
+                .stopOnFailure()
+                .run(dir -> {
+                  updateAuditContext(OP_PREPARE_DIR_ANCESTORS);
+                  deleteDirWithFile(dir);
+                }));
+    LOG.info("Time to delete files {}", humanTime(d.toMillis()));
+  }
+
+  /**
+   * Delete a file at a path where a directory is to be created.
+   * @param dir path of the file to delete
+   * @throws IOException failure other than FileNotFoundException
+   */
+  private void deleteDirWithFile(Path dir) throws IOException {
+    // report progress back
+    progress();
+    LOG.info("{}: Deleting file {}", getName(), dir);
+    delete(dir, false, OP_DELETE);
+    // note its final state
+    addToDirectoryMap(dir, DirMapState.fileNowDeleted);
+  }
+
+
+  /**
+   * Create a directory if required, updating the directory map
+   * and, if the operation took place, the list of created dirs.
+   * Reports progress on invocation.
+   * @param dirEntry entry
+   * @throws IOException failure.
+   */
+  private void createOneDirectory(final DirEntry dirEntry) throws IOException {
+    // report progress back
+    progress();
+    final Path dir = dirEntry.getDestPath();
+    updateAuditContext(OP_STAGE_JOB_CREATE_TARGET_DIRS);
+    final DirMapState state = maybeCreateOneDirectory(dirEntry);
+    switch (state) {
+    case dirFoundInStore:
+      addToDirectoryMap(dir, state);
+      break;
+    case dirWasCreated:
+    case dirCreatedOnSecondAttempt:
+      addCreatedDirectory(dir);
+      addToDirectoryMap(dir, state);
+      break;
+    default:
+      break;
+    }
+
+  }
+
+
+  /**
+   * Try to efficiently and robustly create a directory in a method which is
+   * expected to be executed in parallel with operations creating
+   * peer directories.
+   * @param dirEntry entry for the directory to create
+   * @return the final state of the directory
+   * @throws IOException IO Failure.
+   */
+  private DirMapState maybeCreateOneDirectory(DirEntry dirEntry) throws IOException {
+    final EntryStatus status = dirEntry.getStatus();
+    if (status == EntryStatus.dir) {
+      return DirMapState.dirFoundInStore;
+    }
+    // present in case directories are ever created in task commits
+    if (status == EntryStatus.created_dir) {
+      return DirMapState.dirWasCreated;
+    }
+
+    // here the dir doesn't exist because
+    // it was a file and has been deleted, or
+    // checks failed. create it.
+    final Path path = dirEntry.getDestPath();
+
+    LOG.info("Creating directory {}", path);
+
+    try {
+      if (mkdirs(path, false)) {
+        // success: return immediately.
+        return DirMapState.dirWasCreated;
+      }
+      getIOStatistics().incrementCounter(OP_MKDIRS_RETURNED_FALSE);
+
+      LOG.info("{}: mkdirs({}) returned false, attempting to recover",
+          getName(), path);
+    } catch (IOException e) {
+      // can be caused by file existing, etc.
+      LOG.info("{}: mkdir({}) raised exception {}", getName(), path, e.toString());
+      LOG.debug("{}: Mkdir stack", getName(), e);
+    }
+
+    // fallback to checking the FS, in case a different process did it.
+    FileStatus st = getFileStatusOrNull(path);
+    if (st != null) {
+      if (!st.isDirectory()) {
+        // is bad: delete a file
+        LOG.info("{}: Deleting file where a directory should go: {}",
+            getName(), st);
+        delete(path, false, OP_DELETE_FILE_UNDER_DESTINATION);
+      } else {
+        // is good.
+        LOG.warn("{}: Even though mkdirs({}) failed, there is now a directory there",
+            getName(), path);
+        return DirMapState.dirFoundInStore;
+      }
+    } else {
+      // nothing found. This should never happen.
+      LOG.warn("{}: Although mkdirs({}) returned false, there's nothing at that path to prevent it",
+          getName(), path);
+
+    }
+
+    // try to create the directory again
+    // if this fails again, or an IOException is raised, the
+    // failure propagates to the caller.
+    if (!mkdirs(path, false)) {
+
+      // mkdirs failed again
+      getIOStatistics().incrementCounter(OP_MKDIRS_RETURNED_FALSE);
+
+      // require the dir to exist, raising an exception if it does not.
+      directoryMustExist("Creating directory ", path);
+    }
+
+    // we only get here if the second attempt recovered
+    return DirMapState.dirCreatedOnSecondAttempt;
+
+  }
+
+  /**
+   * Add a created dir to the list of created dirs.
+   * @param dir new dir.
+   */
+  private void addCreatedDirectory(final Path dir) {
+    synchronized (createdDirectories) {
+      createdDirectories.add(dir);
+    }
+  }
+
+  /**
+   * Add a dir to the directory map if there is not already an entry there.
+   * @param dir directory.
+   * @param state state of entry
+   */
+  private void addToDirectoryMap(final Path dir,
+      DirMapState state) {
+    dirMap.putIfAbsent(dir, state);
+  }
+
+
+  /**
+   * Result of the operation.
+   */
+  public static final class Result {
+
+    /** directories created. */
+    private final Set<Path> createdDirectories;
+
+    /**
+     * Map of dirs built up during preparation.
+     */
+    private final Map<Path, DirMapState> dirMap;
+
+    public Result(Set<Path> createdDirectories,
+        Map<Path, DirMapState> dirMap) {
+      this.createdDirectories = requireNonNull(createdDirectories);
+      this.dirMap = requireNonNull(dirMap);
+    }
+
+    public Set<Path> getCreatedDirectories() {
+      return createdDirectories;
+    }
+
+    public Map<Path, DirMapState> getDirMap() {
+      return dirMap;
+    }
+
+    @Override
+    public String toString() {
+      return "Result{" +
+          "directory count=" + createdDirectories.size() +
+          '}';
+    }
+  }
+
+  /**
+   * Enumeration of dir states in the dir map.
+   */
+  public enum DirMapState {
+    dirFoundInStore,
+    dirFoundInMap,
+    dirWasCreated,
+    dirCreatedOnSecondAttempt,
+    fileNowDeleted,
+    ancestorWasDirOrMissing,
+    parentWasNotFile,
+    parentOfCreatedDir
+  }
+
+}
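A hedged sketch of invoking this stage during job commit and handing its output on to the rename stage; the class and accessor names are those defined above, while the surrounding method is illustrative only.

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CreateOutputDirectoriesStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class CreateDirsSketch {

  static Set<Path> prepareDestination(StageConfig stageConfig,
      List<TaskManifest> manifests) throws IOException {
    CreateOutputDirectoriesStage.Result result =
        new CreateOutputDirectoriesStage(stageConfig).apply(manifests);
    // the set of created directories lets the rename stage skip the
    // delete() probe for files whose parent was only just created.
    return result.getCreatedDirectories();
  }
}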

+ 33 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/JobOrTaskStage.java

@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.util.functional.FunctionRaisingIOE;
+
+/**
+ * A job or task stage is a function which may raise an IOException
+ * and which also provides IOStatistics.
+ * @param <IN> Type of arguments to the stage.
+ * @param <OUT> Type of result.
+ */
+public interface JobOrTaskStage<IN, OUT> extends FunctionRaisingIOE<IN, OUT>,
+    IOStatisticsSource {
+
+}
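Purely as an illustration of the interface, assuming (as the rest of this patch suggests, though it is not shown here) that the concrete stages implement it through their AbstractJobOrTaskStage superclass; the wrapper class is hypothetical.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.JobOrTaskStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class StageAsFunctionSketch {

  static Path runSetup(StageConfig taskStageConfig) throws IOException {
    // a stage is just a function from its argument type to its result type...
    JobOrTaskStage<String, Path> setup = new SetupTaskStage(taskStageConfig);
    Path taskAttemptDir = setup.apply("task setup");
    // ...which also publishes the statistics (such as stage duration) it collected.
    System.out.println(setup.getIOStatistics());
    return taskAttemptDir;
  }
}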

+ 293 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/LoadManifestsStage.java

@@ -0,0 +1,293 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static org.apache.commons.io.FileUtils.byteCountToDisplaySize;
+import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfInvocation;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_MANIFEST_FILE_SIZE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_LOAD_ALL_MANIFESTS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_LOAD_MANIFESTS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.maybeAddIOStatistics;
+
+/**
+ * Stage to load all the task manifests in the job attempt directory.
+ * Invoked in Job Commit.
+ * Manifests are loaded in parallel.
+ * The statistics of each manifest are aggregated into the stage summary,
+ * and stripped from the manifest itself if prune == true.
+ * This keeps the memory footprint of each manifest down.
+ */
+public class LoadManifestsStage extends
+    AbstractJobOrTaskStage<
+        Boolean,
+        LoadManifestsStage.Result> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      LoadManifestsStage.class);
+
+  /**
+   * Summary of manifest loading.
+   */
+  private final SummaryInfo summaryInfo = new SummaryInfo();
+
+  /**
+   * Should manifests be pruned of IOStatistics?
+   */
+  private boolean pruneManifests;
+
+  /**
+   * List of loaded manifests.
+   */
+  private final List<TaskManifest> manifests = new ArrayList<>();
+
+  public LoadManifestsStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_LOAD_MANIFESTS, true);
+  }
+
+  /**
+   * Load the manifests.
+   * @param prune should manifests be pruned of IOStatistics?
+   * @return the summary and a list of manifests.
+   * @throws IOException IO failure.
+   */
+  @Override
+  protected LoadManifestsStage.Result executeStage(
+      final Boolean prune) throws IOException {
+
+    final Path manifestDir = getTaskManifestDir();
+    LOG.info("{}: Executing Manifest Job Commit with manifests in {}",
+        getName(),
+        manifestDir);
+    pruneManifests = prune;
+    // build a list of all task manifests successfully committed
+    //
+    msync(manifestDir);
+    final RemoteIterator<FileStatus> manifestFiles = listManifests();
+
+    final List<TaskManifest> manifestList = loadAllManifests(manifestFiles);
+    LOG.info("{}: Summary of {} manifests loaded in {}: {}",
+        getName(),
+        manifestList.size(),
+        manifestDir,
+        summaryInfo);
+
+    // collect any stats
+    maybeAddIOStatistics(getIOStatistics(), manifestFiles);
+    return new LoadManifestsStage.Result(summaryInfo, manifestList);
+  }
+
+  /**
+   * Load all the manifests.
+   * @param manifestFiles list of manifest files.
+   * @return the loaded manifests.
+   * @throws IOException IO Failure.
+   */
+  private List<TaskManifest> loadAllManifests(
+      final RemoteIterator<FileStatus> manifestFiles) throws IOException {
+
+    trackDurationOfInvocation(getIOStatistics(), OP_LOAD_ALL_MANIFESTS, () ->
+        TaskPool.foreach(manifestFiles)
+            .executeWith(getIOProcessors())
+            .stopOnFailure()
+            .run(this::processOneManifest));
+    return manifests;
+  }
+
+  /**
+   * Method invoked to process one manifest.
+   * @param status file to process.
+   * @throws IOException failure to load/parse
+   */
+  private void processOneManifest(FileStatus status)
+      throws IOException {
+    updateAuditContext(OP_LOAD_ALL_MANIFESTS);
+
+    TaskManifest m = fetchTaskManifest(status);
+    progress();
+
+    // update the manifest list in a synchronized block.
+
+    synchronized (manifests) {
+      manifests.add(m);
+      // and the summary info in the same block, to
+      // eliminate the need to acquire a second lock.
+      summaryInfo.add(m);
+    }
+    if (pruneManifests) {
+      m.setIOStatistics(null);
+      m.getExtraData().clear();
+    }
+  }
+
+  /**
+   * Load and validate a single manifest file.
+   * To reduce the memory footprint, the IOStatistics and
+   * extra data of each manifest are cleared after loading.
+   * @param status status of file.
+   * @return the loaded manifest.
+   * @throws IOException IO Failure.
+   */
+  private TaskManifest fetchTaskManifest(FileStatus status)
+      throws IOException {
+    if (status.getLen() == 0 || !status.isFile()) {
+      throw new PathIOException(status.getPath().toString(),
+          "Not a valid manifest file; file status = " + status);
+    }
+    // load the manifest, which includes validation.
+    final TaskManifest manifest = loadManifest(status);
+    final String id = manifest.getTaskAttemptID();
+    final int filecount = manifest.getFilesToCommit().size();
+    final long size = manifest.getTotalFileSize();
+    LOG.info("{}: Task Attempt {} file {}: File count: {}; data size={}",
+        getName(), id, status.getPath(), filecount, size);
+    // record file size for tracking of memory consumption.
+    getIOStatistics().addMeanStatisticSample(COMMITTER_TASK_MANIFEST_FILE_SIZE,
+        status.getLen());
+    return manifest;
+  }
+
+  /**
+   * Result of the stage.
+   */
+  public static final class Result {
+    private final SummaryInfo summary;
+
+    private final List<TaskManifest> manifests;
+
+    public Result(SummaryInfo summary,
+        List<TaskManifest> manifests) {
+      this.summary = summary;
+      this.manifests = manifests;
+    }
+
+    public SummaryInfo getSummary() {
+      return summary;
+    }
+
+    public List<TaskManifest> getManifests() {
+      return manifests;
+    }
+
+  }
+
+  /**
+   * Summary information.
+   */
+  public static final class SummaryInfo implements IOStatisticsSource {
+
+    /**
+     * Aggregate IOStatistics.
+     */
+    private IOStatisticsSnapshot iostatistics = snapshotIOStatistics();
+
+    /**
+     * How many manifests were loaded.
+     */
+    private long manifestCount;
+
+    /**
+     * Total number of files to rename.
+     */
+    private long fileCount;
+
+    /**
+     * Total number of directories which may need
+     * to be created.
+     * As there is no dedup, this is likely to be
+     * a (major) overestimate.
+     */
+    private long directoryCount;
+
+    /**
+     * Total amount of data to be committed.
+     */
+    private long totalFileSize;
+
+    /**
+     * Get the IOStatistics.
+     * @return aggregate IOStatistics
+     */
+    @Override
+    public IOStatisticsSnapshot getIOStatistics() {
+      return iostatistics;
+    }
+
+    public long getFileCount() {
+      return fileCount;
+    }
+
+    public long getDirectoryCount() {
+      return directoryCount;
+    }
+
+    public long getTotalFileSize() {
+      return totalFileSize;
+    }
+
+    public long getManifestCount() {
+      return manifestCount;
+    }
+
+    /**
+     * Add all statistics.
+     * @param manifest manifest to add.
+     */
+    public void add(TaskManifest manifest) {
+      manifestCount++;
+      iostatistics.aggregate(manifest.getIOStatistics());
+      fileCount += manifest.getFilesToCommit().size();
+      directoryCount += manifest.getDestDirectories().size();
+      totalFileSize += manifest.getTotalFileSize();
+    }
+
+    /**
+     * The string form includes all summary info except statistics.
+     * @return string value
+     */
+    @Override
+    public String toString() {
+      final StringBuilder sb = new StringBuilder(
+          "SummaryInfo{");
+      sb.append("manifestCount=").append(manifestCount);
+      sb.append(", fileCount=").append(fileCount);
+      sb.append(", directoryCount=").append(directoryCount);
+      sb.append(", totalFileSize=").append(
+          byteCountToDisplaySize(totalFileSize));
+      sb.append('}');
+      return sb.toString();
+    }
+  }
+}
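A sketch of the load step in job commit; the prune flag and the Result/SummaryInfo accessors are those defined above, while the surrounding method and print statement are illustrative only.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.LoadManifestsStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class LoadManifestsSketch {

  static List<TaskManifest> loadForCommit(StageConfig stageConfig)
      throws IOException {
    // prune == true: strip per-manifest IOStatistics once aggregated,
    // to keep the heap footprint of a large job down.
    LoadManifestsStage.Result result =
        new LoadManifestsStage(stageConfig).apply(Boolean.TRUE);
    LoadManifestsStage.SummaryInfo summary = result.getSummary();
    System.out.printf("Loaded %d manifests: %d files, %d bytes%n",
        summary.getManifestCount(),
        summary.getFileCount(),
        summary.getTotalFileSize());
    return result.getManifests();
  }
}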

+ 173 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/RenameFilesStage.java

@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER_FILE_LIMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_RENAME_FILES;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createManifestOutcome;
+import static org.apache.hadoop.thirdparty.com.google.common.collect.Iterables.concat;
+
+/**
+ * This stage renames all the files.
+ * Input: the manifests and the set of directories created, as returned by
+ * {@link CreateOutputDirectoriesStage}.
+ * If the job is configured to delete target files, the delete() call
+ * can be skipped whenever the parent dir had to be created.
+ * It returns a manifest success data structure summarizing the
+ * output, but does not add IOStatistics to it.
+ */
+public class RenameFilesStage extends
+    AbstractJobOrTaskStage<
+        Pair<List<TaskManifest>, Set<Path>>,
+        ManifestSuccessData> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      RenameFilesStage.class);
+
+  /**
+   * List of all files committed.
+   */
+  private final List<FileEntry> filesCommitted = new ArrayList<>();
+
+  /**
+   * Total file size.
+   */
+  private long totalFileSize = 0;
+
+  private Set<Path> createdDirectories;
+
+  public RenameFilesStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_RENAME_FILES, true);
+  }
+
+  /**
+   * Get the list of files committed.
+   * The getter is synchronized; the returned list itself is not.
+   * @return direct access to the list of files.
+   */
+  public synchronized List<FileEntry> getFilesCommitted() {
+    return filesCommitted;
+  }
+
+  /**
+   * Get the total file size of the committed task.
+   * @return a number greater than or equal to zero.
+   */
+  public synchronized long getTotalFileSize() {
+    return totalFileSize;
+  }
+
+  /**
+   * Rename files in job commit.
+   * @param args pair of (task manifests, set of created directories).
+   * @return the job report.
+   * @throws IOException failure
+   */
+  @Override
+  protected ManifestSuccessData executeStage(
+      Pair<List<TaskManifest>, Set<Path>> args)
+      throws IOException {
+
+    final List<TaskManifest> taskManifests = args.getLeft();
+    createdDirectories = args.getRight();
+
+    final ManifestSuccessData success = createManifestOutcome(getStageConfig(),
+        OP_STAGE_JOB_COMMIT);
+    final int manifestCount = taskManifests.size();
+
+    LOG.info("{}: Executing Manifest Job Commit with {} manifests in {}",
+        getName(), manifestCount, getTaskManifestDir());
+
+    // first step is to aggregate the output of all manifests into a single
+    // list of files to commit.
+    // Which Guava can do in a zero-copy concatenated iterator
+
+    final Iterable<FileEntry> filesToCommit = concat(taskManifests.stream()
+        .map(TaskManifest::getFilesToCommit)
+        .collect(Collectors.toList()));
+
+    TaskPool.foreach(filesToCommit)
+        .executeWith(getIOProcessors())
+        .stopOnFailure()
+        .run(this::commitOneFile);
+
+    // synchronized block to keep spotbugs happy.
+    List<FileEntry> committed = getFilesCommitted();
+    LOG.info("{}: Files committed: {}. Total size {}",
+        getName(), committed.size(), getTotalFileSize());
+
+    // Add a subset of the destination files to the success file;
+    // enough for simple testing
+    success.setFilenamePaths(
+        committed
+            .subList(0, Math.min(committed.size(), SUCCESS_MARKER_FILE_LIMIT))
+            .stream().map(FileEntry::getDestPath)
+            .collect(Collectors.toList()));
+
+    success.setSuccess(true);
+
+    return success;
+  }
+
+  /**
+   * Commit one file by rename, then, if that doesn't fail,
+   * add to the files committed list.
+   * @param entry entry to commit.
+   * @throws IOException failure.
+   */
+  private void commitOneFile(FileEntry entry) throws IOException {
+    updateAuditContext(OP_STAGE_JOB_RENAME_FILES);
+
+    // report progress back
+    progress();
+
+    // if the destination is to be deleted before the rename,
+    // skip the delete when the parent dir was only just created:
+    // nothing can exist under it yet.
+    final boolean deleteDest = getStageConfig().getDeleteTargetPaths()
+        && !createdDirectories.contains(entry.getDestPath().getParent());
+    // do the rename
+    commitFile(entry, deleteDest);
+
+    // update the list and IOStats
+    synchronized (this) {
+      filesCommitted.add(entry);
+      totalFileSize += entry.getSize();
+    }
+
+  }
+
+}
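This is the invocation pattern used by the job commit stage earlier in this patch (the manifests paired with the created-directory set); shown here standalone as a sketch, with the wrapper class purely illustrative.

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.commons.lang3.tuple.Pair;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.RenameFilesStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class RenameFilesSketch {

  static ManifestSuccessData renameAll(StageConfig stageConfig,
      List<TaskManifest> manifests,
      Set<Path> createdDirs) throws IOException {
    // the created-directory set is what allows the delete() probe to be
    // skipped for destinations whose parent directory was just created.
    return new RenameFilesStage(stageConfig)
        .apply(Pair.of(manifests, createdDirs));
  }
}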

+ 79 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveSuccessFileStage.java

@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.TMP_SUFFIX;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_SAVE_SUCCESS;
+
+/**
+ * Save the _SUCCESS file to the destination directory
+ * via a temp file in the job attempt dir.
+ * Returns the path of the saved file.
+ */
+public class SaveSuccessFileStage extends
+    AbstractJobOrTaskStage<ManifestSuccessData, Path> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      SaveSuccessFileStage.class);
+
+  public SaveSuccessFileStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_SAVE_SUCCESS, false);
+  }
+
+  /**
+   * Stage name is always job commit.
+   * @param arguments args to the invocation.
+   * @return stage name
+   */
+  @Override
+  protected String getStageName(ManifestSuccessData arguments) {
+    // set it to the job commit stage, always.
+    return OP_STAGE_JOB_COMMIT;
+  }
+
+  /**
+   * Execute.
+   * @param successData success data to save
+   * @return path saved to.
+   * @throws IOException failure
+   */
+  @Override
+  protected Path executeStage(final ManifestSuccessData successData)
+      throws IOException {
+    // Save the marker
+    Path successFile = getStageConfig().getJobSuccessMarkerPath();
+    Path successTempFile = new Path(getJobAttemptDir(), SUCCESS_MARKER + TMP_SUFFIX);
+    LOG.debug("{}: Saving _SUCCESS file to {} via {}", successFile,
+        getName(),
+        successTempFile);
+    save(successData, successTempFile, successFile);
+    return successFile;
+  }
+
+}

+ 81 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java

@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_SAVE_MANIFEST;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestPathForTask;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestTempPathForTaskAttempt;
+
+/**
+ * Save a task manifest to the job attempt dir, using the task
+ * ID for the name of the final file.
+ * For atomic writes, the manifest is saved
+ * by writing to a temp file and then renaming it.
+ * The temp filename is derived from the task attempt ID;
+ * before the rename of (temp, final-path), any file at the final path
+ * is deleted.
+ * This is so that when this stage is invoked in a repeated task commit,
+ * its output overwrites that of any earlier commit.
+ * When it succeeds, therefore, unless there is a subsequent commit of
+ * another attempt of the same task, the manifest at the final path is from this
+ * operation.
+ *
+ * Returns the path where the manifest was saved.
+ */
+public class SaveTaskManifestStage extends
+    AbstractJobOrTaskStage<TaskManifest, Path> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      SaveTaskManifestStage.class);
+
+  public SaveTaskManifestStage(final StageConfig stageConfig) {
+    super(true, stageConfig, OP_STAGE_TASK_SAVE_MANIFEST, false);
+  }
+
+  /**
+   * Save the manifest to a temp file and rename to the final
+   * manifest destination.
+   * @param manifest manifest
+   * @return the path to the final entry
+   * @throws IOException IO failure.
+   */
+  @Override
+  protected Path executeStage(final TaskManifest manifest)
+      throws IOException {
+
+    final Path manifestDir = getTaskManifestDir();
+    // final manifest file is by task ID
+    Path manifestFile = manifestPathForTask(manifestDir,
+        getRequiredTaskId());
+    Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir,
+        getRequiredTaskAttemptId());
+    LOG.info("{}: Saving manifest file to {}", getName(), manifestFile);
+    save(manifest, manifestTempFile, manifestFile);
+    return manifestFile;
+  }
+
+}
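The write-to-temp-then-rename pattern described above is implemented by the superclass's save() (not shown in this file); the following is only an illustration of the idea using the plain Hadoop FileSystem API, not the committer's actual code, and the class and method names are hypothetical.

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class AtomicSaveSketch {

  static void saveAtomically(FileSystem fs, String json,
      Path tempPath, Path finalPath) throws IOException {
    // write the whole document to the temp path first
    try (FSDataOutputStream out = fs.create(tempPath, true)) {
      out.write(json.getBytes(StandardCharsets.UTF_8));
    }
    // delete anything already at the final path, then rename into place
    fs.delete(finalPath, false);
    if (!fs.rename(tempPath, finalPath)) {
      throw new IOException("Failed to rename " + tempPath + " to " + finalPath);
    }
  }
}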

+ 63 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupJobStage.java

@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_SETUP;
+
+/**
+ * Stage to set up a job by creating the job attempt directory.
+ * The job attempt directory must not exist before the call.
+ */
+public class SetupJobStage extends
+    AbstractJobOrTaskStage<Boolean, Path> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      SetupJobStage.class);
+
+  public SetupJobStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_SETUP, false);
+  }
+
+  /**
+   * Execute the job setup stage.
+   * @param deleteMarker should any existing success marker be deleted?
+   * @return the job attempt directory.
+   * @throws IOException failure.
+   */
+  @Override
+  protected Path executeStage(final Boolean deleteMarker) throws IOException {
+    final Path path = getJobAttemptDir();
+    LOG.info("{}: Creating Job Attempt directory {}", getName(), path);
+    createNewDirectory("Job setup", path);
+    createNewDirectory("Creating task manifest dir", getTaskManifestDir());
+    // delete any success marker if so instructed.
+    if (deleteMarker) {
+      delete(getStageConfig().getJobSuccessMarkerPath(), false);
+    }
+    return path;
+  }
+
+}

+ 54 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupTaskStage.java

@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.Path;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_SETUP;
+
+/**
+ * Stage to set up task.
+ * This creates the task attempt directory, after verifying
+ * that the job attempt dir exists (i.e. this is invoked
+ * after the job is started and before any cleanup).
+ * The argument passed in is the task name, used only for logging.
+ */
+public class SetupTaskStage extends
+    AbstractJobOrTaskStage<String, Path> {
+
+  public SetupTaskStage(final StageConfig stageConfig) {
+    super(true, stageConfig, OP_STAGE_TASK_SETUP, false);
+  }
+
+  /**
+   * Set up a task.
+   * @param name task name (for logging)
+   * @return task attempt directory
+   * @throws IOException IO failure.
+   */
+  @Override
+  protected Path executeStage(final String name) throws IOException {
+    return createNewDirectory("Task setup " + name,
+        requireNonNull(getTaskAttemptDir(), "No task attempt directory"));
+  }
+
+}
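A combined sketch of job and task setup using the two setup stages; the two StageConfig instances are assumed to have been built with the matching job and task attempt directories, and the wrapper class and print statement are illustrative only.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class SetupSketch {

  static Path setupJobThenTask(StageConfig jobConfig, StageConfig taskConfig)
      throws IOException {
    // true: delete any _SUCCESS marker left behind by an earlier job
    Path jobAttemptDir = new SetupJobStage(jobConfig).apply(true);
    System.out.println("Job attempt directory: " + jobAttemptDir);
    // the string argument is only used in log messages
    return new SetupTaskStage(taskConfig).apply("task setup");
  }
}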

+ 556 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java

@@ -0,0 +1,556 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.util.JsonSerialization;
+import org.apache.hadoop.util.Preconditions;
+import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER;
+
+/**
+ * Stage Config.
+ * Everything to configure a stage which is common to all.
+ *
+ * It's isolated from the details of MR datatypes (task ID, task attempt etc.);
+ * at this point it expects parsed values.
+ *
+ * It uses the builder API, but once {@link #build()} is called it goes
+ * read only. This is to ensure that changes cannot
+ * take place when shared across stages.
+ */
+public class StageConfig {
+
+  /**
+   * A flag which freezes the config for
+   * further updates.
+   */
+  private boolean frozen;
+
+  /**
+   * IOStatistics to update.
+   */
+  private IOStatisticsStore iostatistics;
+
+  /**
+   * Job ID; constant over multiple attempts.
+   */
+  private String jobId;
+
+  /**
+   * Where did the job Unique ID come from?
+   */
+  private String jobIdSource = "";
+
+  /**
+   * Number of the job attempt; starts at zero.
+   */
+  private int jobAttemptNumber;
+
+  /**
+   * ID of the task.
+   */
+  private String taskId;
+
+  /**
+   * ID of this specific attempt at a task.
+   */
+  private String taskAttemptId;
+
+  /**
+   * Destination of job.
+   */
+  private Path destinationDir;
+
+  /**
+   * Job attempt dir.
+   */
+  private Path jobAttemptDir;
+
+  /**
+   * temp directory under job dest dir.
+   */
+  private Path outputTempSubDir;
+
+  /**
+   * Task attempt dir.
+   */
+  private Path taskAttemptDir;
+
+  /**
+   * directory where task manifests must go.
+   */
+  private Path taskManifestDir;
+
+  /**
+   * Subdir under the job attempt dir where task
+   * attempts will have subdirectories.
+   */
+  private Path jobAttemptTaskSubDir;
+
+  /**
+   * Callbacks to update store.
+   * This is not made visible to the stages; they must
+   * go through the superclass which
+   * adds statistics and logging.
+   */
+  private ManifestStoreOperations operations;
+
+  /**
+   * Submitter for doing IO against the store other than
+   * manifest processing.
+   */
+  private TaskPool.Submitter ioProcessors;
+
+  /**
+   * Optional progress callback.
+   */
+  private Progressable progressable;
+
+  /**
+   * Callback when a stage is entered.
+   */
+  private StageEventCallbacks enterStageEventHandler;
+
+  /**
+   * Thread local serializer; created on demand
+   * and shareable across a sequence of stages.
+   */
+  private final ThreadLocal<JsonSerialization<TaskManifest>> threadLocalSerializer =
+      ThreadLocal.withInitial(TaskManifest::serializer);
+
+  /**
+   * Delete target paths on commit? Stricter, but
+   * higher IO cost.
+   */
+  private boolean deleteTargetPaths;
+
+  /**
+   * Name for logging.
+   */
+  private String name = "";
+
+  public StageConfig() {
+  }
+
+  /**
+   * Verify that the config is not yet frozen.
+   */
+  private void checkOpen() {
+    Preconditions.checkState(!frozen,
+        "StageConfig is now read-only");
+  }
+
+  /**
+   * The build command makes the config immutable.
+   * Idempotent.
+   * @return the now-frozen config
+   */
+  public StageConfig build() {
+    frozen = true;
+    return this;
+  }
+
+  /**
+   * Set job destination dir.
+   * @param dir new dir
+   * @return this
+   */
+  public StageConfig withDestinationDir(final Path dir) {
+    checkOpen();
+    destinationDir = dir;
+    return this;
+  }
+
+  /**
+   * Set IOStatistics store.
+   * @param store new store
+   * @return this
+   */
+  public StageConfig withIOStatistics(final IOStatisticsStore store) {
+    checkOpen();
+    iostatistics = store;
+    return this;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withIOProcessors(final TaskPool.Submitter value) {
+    checkOpen();
+    ioProcessors = value;
+    return this;
+  }
+
+  /**
+   * Set Job attempt directory.
+   * @param dir new dir
+   * @return this
+   */
+  public StageConfig withJobAttemptDir(final Path dir) {
+    checkOpen();
+    jobAttemptDir = dir;
+    return this;
+  }
+
+  /**
+   * Directory to put task manifests into.
+   * @return a path under the job attempt dir.
+   */
+  public Path getTaskManifestDir() {
+    return taskManifestDir;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return the builder
+   */
+  public StageConfig withTaskManifestDir(Path value) {
+    checkOpen();
+    taskManifestDir = value;
+    return this;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return the builder
+   */
+  public StageConfig withJobAttemptTaskSubDir(Path value) {
+    checkOpen();
+    jobAttemptTaskSubDir = value;
+    return this;
+  }
+
+  /**
+   * Get the path to the subdirectory under $jobID where task
+   * attempts are. List this dir to find all task attempt dirs.
+   * @return a path under the job attempt dir.
+   */
+  public Path getJobAttemptTaskSubDir() {
+    return jobAttemptTaskSubDir;
+  }
+
+  /**
+   * Set the job directories from the attempt directories
+   * information. Does not set task attempt fields.
+   * @param dirs source of directories.
+   * @return this
+   */
+  public StageConfig withJobDirectories(
+      final ManifestCommitterSupport.AttemptDirectories dirs) {
+
+    checkOpen();
+    withJobAttemptDir(dirs.getJobAttemptDir())
+        .withJobAttemptTaskSubDir(dirs.getJobAttemptTaskSubDir())
+        .withDestinationDir(dirs.getOutputPath())
+        .withOutputTempSubDir(dirs.getOutputTempSubDir())
+        .withTaskManifestDir(dirs.getTaskManifestDir());
+
+    return this;
+  }
+
+  /**
+   * Set job ID with no attempt included.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withJobId(final String value) {
+    checkOpen();
+    jobId = value;
+    return this;
+  }
+
+  public Path getOutputTempSubDir() {
+    return outputTempSubDir;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withOutputTempSubDir(final Path value) {
+    checkOpen();
+    outputTempSubDir = value;
+    return this;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withOperations(final ManifestStoreOperations value) {
+    checkOpen();
+    operations = value;
+    return this;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withTaskAttemptId(final String value) {
+    checkOpen();
+    taskAttemptId = value;
+    return this;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withTaskId(final String value) {
+    checkOpen();
+    taskId = value;
+    return this;
+  }
+
+  /**
+   * Set handler for stage entry/exit events.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withStageEventCallbacks(StageEventCallbacks value) {
+    checkOpen();
+    enterStageEventHandler = value;
+    return this;
+  }
+
+  /**
+   * Optional progress callback.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withProgressable(final Progressable value) {
+    checkOpen();
+    progressable = value;
+    return this;
+  }
+
+  /**
+   * Set the Task attempt directory.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withTaskAttemptDir(final Path value) {
+    checkOpen();
+    taskAttemptDir = value;
+    return this;
+  }
+
+  /**
+   * Set the job attempt number.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withJobAttemptNumber(final int value) {
+    checkOpen();
+    jobAttemptNumber = value;
+    return this;
+  }
+
+  /**
+   * Set the Job ID source.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withJobIdSource(final String value) {
+    checkOpen();
+    jobIdSource = value;
+    return this;
+  }
+
+  /**
+   * Set name of task/job.
+   * @param value new value
+   * @return the builder
+   */
+  public StageConfig withName(String value) {
+    name = value;
+    return this;
+  }
+
+  /**
+   * Get name of task/job.
+   * @return name for logging.
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * Handler for stage entry events.
+   * @return the handler.
+   */
+  public StageEventCallbacks getEnterStageEventHandler() {
+    return enterStageEventHandler;
+  }
+
+  /**
+   * IOStatistics to update.
+   */
+  public IOStatisticsStore getIOStatistics() {
+    return iostatistics;
+  }
+
+  /**
+   * Job ID.
+   */
+  public String getJobId() {
+    return jobId;
+  }
+
+  /**
+   * ID of the task.
+   */
+  public String getTaskId() {
+    return taskId;
+  }
+
+  /**
+   * ID of this specific attempt at a task.
+   */
+  public String getTaskAttemptId() {
+    return taskAttemptId;
+  }
+
+  /**
+   * Job attempt dir.
+   */
+  public Path getJobAttemptDir() {
+    return jobAttemptDir;
+  }
+
+  /**
+   * Destination of job.
+   */
+  public Path getDestinationDir() {
+    return destinationDir;
+  }
+
+  /**
+   * Get the location of the success marker.
+   * @return a path under the destination directory.
+   */
+  public Path getJobSuccessMarkerPath() {
+    return new Path(destinationDir, SUCCESS_MARKER);
+  }
+
+  /**
+   * Callbacks to update store.
+   * This is not made visible to the stages; they must
+   * go through the wrapper classes in this class, which
+   * add statistics and logging.
+   */
+  public ManifestStoreOperations getOperations() {
+    return operations;
+  }
+
+  /**
+   * Submitter for doing IO against the store other than
+   * manifest processing.
+   */
+  public TaskPool.Submitter getIoProcessors() {
+    return ioProcessors;
+  }
+
+  /**
+   * Get optional progress callback.
+   * @return callback or null
+   */
+  public Progressable getProgressable() {
+    return progressable;
+  }
+
+  /**
+   * Task attempt directory.
+   * @return the task attempt dir.
+   */
+  public Path getTaskAttemptDir() {
+    return taskAttemptDir;
+  }
+
+  /**
+   * Get the job attempt number.
+   * @return the value
+   */
+  public int getJobAttemptNumber() {
+    return jobAttemptNumber;
+  }
+
+  public String getJobIdSource() {
+    return jobIdSource;
+  }
+
+  /**
+   * Get a thread local task manifest serializer.
+   * @return a serializer.
+   */
+  public JsonSerialization<TaskManifest> currentManifestSerializer() {
+    return threadLocalSerializer.get();
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return the builder
+   */
+  public StageConfig withDeleteTargetPaths(boolean value) {
+    checkOpen();
+    deleteTargetPaths = value;
+    return this;
+  }
+
+  public boolean getDeleteTargetPaths() {
+    return deleteTargetPaths;
+  }
+
+  /**
+   * Enter the stage; calls back to
+   * {@link #enterStageEventHandler} if non-null.
+   * @param stage stage entered
+   */
+  public void enterStage(String stage) {
+    if (enterStageEventHandler != null) {
+      enterStageEventHandler.enterStage(stage);
+    }
+  }
+
+  /**
+   * Exit the stage; calls back to
+   * {@link #enterStageEventHandler} if non-null.
+   * @param stage stage exited.
+   */
+  public void exitStage(String stage) {
+    if (enterStageEventHandler != null) {
+      enterStageEventHandler.exitStage(stage);
+    }
+  }
+}
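
A minimal usage sketch of the builder above, assuming the caller supplies its own IDs and
paths (in practice the committer constructs and freezes its `StageConfig` internally; the
values here are placeholders):

```java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public class StageConfigExample {
  public static StageConfig example() {
    // chain the setters, then freeze the config with build()
    return new StageConfig()
        .withJobId("job_202203140921_0001")                          // placeholder job ID
        .withJobAttemptNumber(0)
        .withTaskAttemptId("attempt_202203140921_0001_m_000000_0")   // placeholder attempt ID
        .withDestinationDir(new Path("abfs://container@account.dfs.core.windows.net/output"))
        .withDeleteTargetPaths(false)
        .build();   // setters guarded by checkOpen() will now fail
  }
}
```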

+ 37 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageEventCallbacks.java

@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+/**
+ * Callbacks on stage entry/exit.
+ */
+public interface StageEventCallbacks {
+
+  /**
+   * Callback on stage entry.
+   * @param stage new stage
+   */
+  void enterStage(String stage);
+
+  /**
+   * Callback on stage exit.
+   * @param stage stage exited.
+   */
+  void exitStage(String stage);
+}
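
As an illustrative sketch only (not part of the committer), a trivial implementation of this
interface which logs stage transitions, and which could be passed to
`StageConfig.withStageEventCallbacks()`:

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageEventCallbacks;

public class LoggingStageEventCallbacks implements StageEventCallbacks {

  private static final Logger LOG =
      LoggerFactory.getLogger(LoggingStageEventCallbacks.class);

  @Override
  public void enterStage(String stage) {
    LOG.info("Entering stage {}", stage);
  }

  @Override
  public void exitStage(String stage) {
    LOG.info("Exiting stage {}", stage);
  }
}
```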

+ 214 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/TaskAttemptScanDirectoryStage.java

@@ -0,0 +1,214 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.LongSummaryStatistics;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.EntryStatus;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.DurationInfo;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_DIRECTORY_COUNT_MEAN;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_DIRECTORY_DEPTH_MEAN;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_FILE_COUNT_MEAN;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_FILE_SIZE_MEAN;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_SCAN_DIRECTORY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createTaskManifest;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.maybeAddIOStatistics;
+
+/**
+ * Stage to scan a directory tree and build a task manifest.
+ * This is executed by the task committer.
+ */
+public final class TaskAttemptScanDirectoryStage
+    extends AbstractJobOrTaskStage<Void, TaskManifest> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      TaskAttemptScanDirectoryStage.class);
+
+  public TaskAttemptScanDirectoryStage(
+      final StageConfig stageConfig) {
+    super(true, stageConfig, OP_STAGE_TASK_SCAN_DIRECTORY, false);
+  }
+
+  /**
+   * Build the Manifest.
+   * @return the manifest
+   * @throws IOException failure.
+   */
+  @Override
+  protected TaskManifest executeStage(final Void arguments)
+      throws IOException {
+
+    final Path taskAttemptDir = getRequiredTaskAttemptDir();
+    final TaskManifest manifest = createTaskManifest(getStageConfig());
+
+    LOG.info("{}: scanning directory {}",
+        getName(), taskAttemptDir);
+
+    final int depth = scanDirectoryTree(manifest,
+        taskAttemptDir,
+        getDestinationDir(),
+        0, true);
+    List<FileEntry> filesToCommit = manifest.getFilesToCommit();
+    LongSummaryStatistics fileSummary = filesToCommit.stream()
+        .mapToLong(FileEntry::getSize)
+        .summaryStatistics();
+    long fileDataSize = fileSummary.getSum();
+    long fileCount = fileSummary.getCount();
+    int dirCount = manifest.getDestDirectories().size();
+    LOG.info("{}: directory {} contained {} file(s); data size {}",
+        getName(),
+        taskAttemptDir,
+        fileCount,
+        fileDataSize);
+    LOG.info("{}: Directory count = {}; maximum depth {}",
+        getName(),
+        dirCount,
+        depth);
+    // add statistics about the task output which, when aggregated, provides
+    // insight into structure of job, task skew, etc.
+    IOStatisticsStore iostats = getIOStatistics();
+    iostats.addSample(COMMITTER_TASK_DIRECTORY_COUNT_MEAN, dirCount);
+    iostats.addSample(COMMITTER_TASK_DIRECTORY_DEPTH_MEAN, depth);
+    iostats.addSample(COMMITTER_TASK_FILE_COUNT_MEAN, fileCount);
+    iostats.addSample(COMMITTER_TASK_FILE_SIZE_MEAN, fileDataSize);
+
+    return manifest;
+  }
+
+  /**
+   * Recursively scan a directory tree.
+   * The manifest will contain all files to rename
+   * (source and dest) and directories to create.
+   * All files are processed before any of the subdirs are.
+   * This helps in statistics gathering.
+   * There's some optimizations which could be done with async
+   * fetching of the iterators of those subdirs, but as this
+   * is generally off-critical path then that "enhancement"
+   * can be postponed until data suggests this needs improvement.
+   * @param manifest manifest to update
+   * @param srcDir dir to scan
+   * @param destDir destination directory
+   * @param depth depth from the task attempt dir.
+   * @param parentDirExists does the parent dir exist?
+   * @return the maximum depth of child directories
+   * @throws IOException IO failure.
+   */
+  private int scanDirectoryTree(
+      TaskManifest manifest,
+      Path srcDir,
+      Path destDir,
+      int depth,
+      boolean parentDirExists) throws IOException {
+
+    // generate some task progress in case directory scanning is very slow.
+    progress();
+
+    int maxDepth = 0;
+    int files = 0;
+    boolean dirExists = parentDirExists;
+    List<FileStatus> subdirs = new ArrayList<>();
+    try (DurationInfo ignored = new DurationInfo(LOG, false,
+        "Task Attempt %s source dir %s, dest dir %s",
+        getTaskAttemptId(), srcDir, destDir)) {
+
+      // list the directory. This may block until the listing is complete,
+      // or, if the FS does incremental or asynchronous fetching,
+      // then the next()/hasNext() call will block for the results
+      // unless turned off; ABFS does this asynchronously.
+      final RemoteIterator<FileStatus> listing = listStatusIterator(srcDir);
+
+      // when the FS (especially ABFS) does an async fetch of the listing,
+      // we can probe for the status of the destination dir while that
+      // page is being fetched.
+      // probe for and add the dest dir entry for all but
+      // the base dir
+
+      if (depth > 0) {
+        final EntryStatus status;
+        if (parentDirExists) {
+          final FileStatus destDirStatus = getFileStatusOrNull(destDir);
+          status = EntryStatus.toEntryStatus(destDirStatus);
+          dirExists = destDirStatus != null;
+        } else {
+          // if there is no parent dir, then there is no need to look
+          // for this directory; report it as missing automatically.
+          status = EntryStatus.not_found;
+        }
+        manifest.addDirectory(DirEntry.dirEntry(
+            destDir,
+            status,
+            depth));
+      }
+
+      // process the listing; this is where abfs will block
+      // to wait for the result of the list call.
+      while (listing.hasNext()) {
+        final FileStatus st = listing.next();
+        if (st.isFile()) {
+          // this is a file, so add to the list of files to commit.
+          files++;
+          final FileEntry entry = fileEntry(st, destDir);
+          manifest.addFileToCommit(entry);
+          LOG.debug("To rename: {}", entry);
+        } else {
+          if (st.isDirectory()) {
+            // will need to scan this directory too.
+            subdirs.add(st);
+          } else {
+            // some other object. ignoring
+            LOG.info("Ignoring FS object {}", st);
+          }
+        }
+      }
+      // add any statistics provided by the listing.
+      maybeAddIOStatistics(getIOStatistics(), listing);
+    }
+
+    // now scan the subdirectories
+    LOG.debug("{}: Number of subdirectories under {} found: {}; file count {}",
+        getName(), srcDir, subdirs.size(), files);
+
+    for (FileStatus st : subdirs) {
+      Path destSubDir = new Path(destDir, st.getPath().getName());
+      final int d = scanDirectoryTree(manifest,
+          st.getPath(),
+          destSubDir,
+          depth + 1,
+          dirExists);
+      maxDepth = Math.max(maxDepth, d);
+    }
+
+    return 1 + maxDepth;
+  }
+
+}

+ 200 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/ValidateRenamedFilesStage.java

@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.OutputValidationException;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static org.apache.commons.lang3.StringUtils.isNotBlank;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_VALIDATE_OUTPUT;
+import static org.apache.hadoop.thirdparty.com.google.common.collect.Iterables.concat;
+
+/**
+ * This stage validates all files by scanning the manifests
+ * and verifying every file in every manifest is of the given size.
+ * Returns a list of all files committed.
+ *
+ * Its cost is one getFileStatus() call (parallelized) per file.
+ * Raises a {@link OutputValidationException} on a validation failure.
+ */
+public class ValidateRenamedFilesStage extends
+    AbstractJobOrTaskStage<
+        List<TaskManifest>,
+        List<FileEntry>> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      ValidateRenamedFilesStage.class);
+
+  /**
+   * Set this to halt all workers.
+   */
+  private final AtomicBoolean halt = new AtomicBoolean();
+
+  /**
+   * List of all files committed.
+   */
+  private List<FileEntry> filesCommitted = new ArrayList<>();
+
+  public ValidateRenamedFilesStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_VALIDATE_OUTPUT, true);
+  }
+
+  /**
+   * Get the list of files committed.
+   * @return a possibly empty list.
+   */
+  private synchronized List<FileEntry> getFilesCommitted() {
+    return filesCommitted;
+  }
+
+  /**
+   * Add a file entry to the list of committed files.
+   * @param entry entry
+   */
+  private synchronized void addFileCommitted(FileEntry entry) {
+    filesCommitted.add(entry);
+  }
+
+  /**
+   * Validate the task manifests.
+   * This is done by listing all the directories
+   * and verifying that every file in the source list
+   * has a file in the destination of the same size.
+   * If two tasks have both written the same file or
+   * a source file was changed after the task was committed,
+   * then a mismatch will be detected, provided the file
+   * length is now different.
+   * @param taskManifests list of manifests.
+   * @return list of files committed.
+   */
+  @Override
+  protected List<FileEntry> executeStage(
+      final List<TaskManifest> taskManifests)
+      throws IOException {
+
+    // set the list of files to be as big as the number of tasks.
+    // synchronized to stop complaints.
+    synchronized (this) {
+      filesCommitted = new ArrayList<>(taskManifests.size());
+    }
+
+    // validate all the files.
+
+    final Iterable<FileEntry> filesToCommit = concat(taskManifests.stream()
+        .map(TaskManifest::getFilesToCommit)
+        .collect(Collectors.toList()));
+
+    TaskPool.foreach(filesToCommit)
+        .executeWith(getIOProcessors())
+        .stopOnFailure()
+        .run(this::validateOneFile);
+
+    return getFilesCommitted();
+  }
+
+  /**
+   * Validate a file.
+   * @param entry entry to probe for
+   * @throws IOException IO problem.
+   * @throws OutputValidationException if the entry is not valid
+   */
+  private void validateOneFile(FileEntry entry) throws IOException {
+    updateAuditContext(OP_STAGE_JOB_VALIDATE_OUTPUT);
+
+    if (halt.get()) {
+      // told to stop
+      return;
+    }
+    // report progress back
+    progress();
+    // validate the file,
+    // raising an FNFE if the file isn't there.
+    FileStatus destStatus;
+    final Path sourcePath = entry.getSourcePath();
+    Path destPath = entry.getDestPath();
+    try {
+      destStatus = getFileStatus(destPath);
+
+      // it must be a file
+      if (!destStatus.isFile()) {
+        throw new OutputValidationException(destPath,
+            "Expected a file renamed from " + sourcePath
+                + "; found " + destStatus);
+      }
+      final long sourceSize = entry.getSize();
+      final long destSize = destStatus.getLen();
+
+      // etags, if the source had one.
+      final String sourceEtag = entry.getEtag();
+      if (isNotBlank(sourceEtag)) {
+        final String destEtag = ManifestCommitterSupport.getEtag(destStatus);
+        if (!sourceEtag.equals(destEtag)) {
+          LOG.warn("Etag of dest file {}: {} does not match that of manifest entry {}",
+              destPath, destStatus, entry);
+          throw new OutputValidationException(destPath,
+              String.format("Expected the file"
+                      + " renamed from %s"
+                      + " with etag %s and length %s"
+                      + " but found a file with etag %s and length %d",
+                  sourcePath,
+                  sourceEtag,
+                  sourceSize,
+                  destEtag,
+                  destSize));
+
+        }
+      }
+      // check the expected length after any etag validation
+      if (destSize != sourceSize) {
+        LOG.warn("Length of dest file {}: {} does not match that of manifest entry {}",
+            destPath, destStatus, entry);
+        throw new OutputValidationException(destPath,
+            String.format("Expected the file"
+                    + " renamed from %s"
+                    + " with length %d"
+                    + " but found a file of length %d",
+                sourcePath,
+                sourceSize,
+                destSize));
+      }
+
+    } catch (FileNotFoundException e) {
+      // file didn't exist
+      throw new OutputValidationException(destPath,
+          "Expected a file, but it was not found", e);
+    }
+    addFileCommitted(entry);
+  }
+
+}

+ 30 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/package-info.java

@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Intermediate manifest committer.
+ *
+ * The individual and aggregate stages of the task and job commit process.
+ */
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;

+ 19 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml

@@ -2231,4 +2231,23 @@
  </description>
</property>
 
+<!-- not yet enabled by default.
+
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.abfs</name>
+  <value>org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory</value>
+  <description>
+    The default committer factory for ABFS is for the manifest committer with
+    abfs-specific tuning.
+  </description>
+</property>
+
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.gs</name>
+  <value>org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory</value>
+  <description>
+    The default committer factory for google cloud storage is for the manifest committer.
+  </description>
+</property>
+-->
</configuration>

+ 605 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md

@@ -0,0 +1,605 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+
+# The Manifest Committer for Azure and Google Cloud Storage
+
+This document describes how to use the _Manifest Committer_.
+
+The _Manifest_ committer is a committer for work which provides
+performance on ABFS for "real world" queries,
+and performance and correctness on GCS.
+It also works with other filesystems, including HDFS.
+However, the design is optimized for object stores where
+listing operations are slow and expensive.
+
+The architecture and implementation of the committer is covered in
+[Manifest Committer Architecture](manifest_committer_architecture.html).
+
+
+The protocol and its correctness are covered in
+[Manifest Committer Protocol](manifest_committer_protocol.html).
+
+It was added in March 2022, and should be considered unstable
+in early releases.
+
+<!-- MACRO{toc|fromDepth=0|toDepth=2} -->
+
+## Problem:
+
+The only committer of work from Spark to Azure ADLS Gen 2 "abfs://" storage
+which is safe to use is the "v1 file committer".
+
+This is "correct" in that if a task attempt fails, its output is guaranteed not
+to be included in the final output. The "v2" commit algorithm cannot meet that
+guarantee, which is why it is no longer the default.
+
+But: it is slow, especially on jobs where deep directory trees of output are used.
+Why is it slow? It's hard to point at a particular cause, primarily because of
+the lack of any instrumentation in the `FileOutputCommitter`.
+Stack traces of running jobs generally show `rename()`, though list operations
+do surface too.
+
+On Google GCS, neither the v1 nor the v2 algorithm is _safe_, because the Google
+filesystem doesn't have the atomic directory rename which the v1 algorithm
+requires.
+
+A further issue is that both Azure and GCS storage may encounter scale issues
+with deleting directories with many descendants.
+This can trigger timeouts because the FileOutputCommitter assumes that
+cleaning up after the job is a fast call to `delete("_temporary", true)`.
+
+## Solution.
+
+The _Intermediate Manifest_ committer is a new committer for
+work which should deliver performance on ABFS
+for "real world" queries, and performance and correctness on GCS.
+
+This committer uses the extension point which came in for the S3A committers.
+Users can declare a new committer factory for abfs:// and gcs:// URLs.
+A suitably configured spark deployment will pick up the new committer.
+
+Directory performance issues in job cleanup can be addressed by two options:
+1. The committer will parallelize deletion of task attempt directories before
+   deleting the `_temporary` directory.
+1. Cleanup can be disabled.
+
+The committer can be used with any filesystem client which has a "real" file rename()
+operation.
+It has been optimised for remote object stores where listing and file probes
+are expensive; the design is less likely to offer such significant speedup
+on HDFS, though the parallel renaming operations will still speed up jobs
+there compared to the classic v1 algorithm.
+
+# <a name="how"></a> How it works
+
+The full details are covered in [Manifest Committer Architecture](manifest_committer_architecture.html).
+
+# <a name="use"></a> Using the committer
+
+The hooks put in to support the S3A committers were designed to allow every
+filesystem schema to provide their own committer.
+See [Switching To an S3A Committer](../../hadoop-aws/tools/hadoop-aws/committers.html#Switching_to_an_S3A_Committer)
+
+A factory for the abfs schema would be defined in
+`mapreduce.outputcommitter.factory.scheme.abfs`; and a similar one for `gcs`.
+
+Some matching spark configuration changes, especially for parquet binding, will be required.
+These can be set in `core-site.xml` if they are not defined in the `mapred-default.xml` JAR.
+
+
+```xml
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.abfs</name>
+  <value>org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory</value>
+</property>
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.gs</name>
+  <value>org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory</value>
+</property>
+```
+
+## Binding to the manifest committer in Spark.
+
+In Apache Spark, the configuration can be done either with command line options (after `--conf`) or by using the `spark-defaults.conf` file. The following is an example of using `spark-defaults.conf`, which also includes the configuration for Parquet with a subclass of the Parquet
+committer which uses the factory mechanism internally.
+
+```
+spark.hadoop.mapreduce.outputcommitter.factory.scheme.abfs org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory
+spark.hadoop.mapreduce.outputcommitter.factory.scheme.gs org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory
+spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
+spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
+```
+
+
+### <a name="committerinfo"></a> Using the Cloudstore `committerinfo` command to probe committer bindings.
+
+The hadoop committer settings can be validated in a recent build of [cloudstore](https://github.com/steveloughran/cloudstore)
+and its `committerinfo` command.
+This command instantiates a committer for that path through the same factory mechanism as MR and spark jobs use,
+then prints its `toString` value.
+
+```
+hadoop jar cloudstore-1.0.jar committerinfo abfs://testing@ukwest.dfs.core.windows.net/
+
+2021-09-16 19:42:59,731 [main] INFO  commands.CommitterInfo (StoreDurationInfo.java:<init>(53)) - Starting: Create committer
+Committer factory for path abfs://testing@ukwest.dfs.core.windows.net/ is
+ org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory@3315d2d7
+  (classname org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory)
+2021-09-16 19:43:00,897 [main] INFO  manifest.ManifestCommitter (ManifestCommitter.java:<init>(144)) - Created ManifestCommitter with
+   JobID job__0000, Task Attempt attempt__0000_r_000000_1 and destination abfs://testing@ukwest.dfs.core.windows.net/
+Created committer of class org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitter:
+ ManifestCommitter{ManifestCommitterConfig{destinationDir=abfs://testing@ukwest.dfs.core.windows.net/,
+   role='task committer',
+   taskAttemptDir=abfs://testing@ukwest.dfs.core.windows.net/_temporary/manifest_job__0000/0/_temporary/attempt__0000_r_000000_1,
+   createJobMarker=true,
+   jobUniqueId='job__0000',
+   jobUniqueIdSource='JobID',
+   jobAttemptNumber=0,
+   jobAttemptId='job__0000_0',
+   taskId='task__0000_r_000000',
+   taskAttemptId='attempt__0000_r_000000_1'},
+   iostatistics=counters=();
+
+gauges=();
+
+minimums=();
+
+maximums=();
+
+means=();
+}
+
+```
+
+
+## Verifying that the committer was used
+
+The new committer will write a JSON summary of the operation, including statistics, in the `_SUCCESS` file.
+
+If this file exists and is zero bytes long: the classic `FileOutputCommitter` was used.
+
+If this file exists and is greater than zero bytes long, either the manifest committer was used,
+or in the case of S3A filesystems, one of the S3A committers. They all use the same JSON format.
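+
+As a quick command-line check (the output path here is illustrative):
+
+```bash
+# a zero-byte _SUCCESS file means the classic FileOutputCommitter was used
+hadoop fs -ls abfs://testing@ukwest.dfs.core.windows.net/output/_SUCCESS
+
+# a non-empty _SUCCESS file is JSON and can be printed directly
+hadoop fs -cat abfs://testing@ukwest.dfs.core.windows.net/output/_SUCCESS
+```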
+
+# <a name="configuration"></a> Configuration options
+
+Here are the main configuration options of the committer.
+
+
+| Option | Meaning | Default Value |
+|--------|---------|---------------|
+| `mapreduce.manifest.committer.delete.target.files` | Delete target files? | `false` |
+| `mapreduce.manifest.committer.io.threads` | Thread count for parallel operations | `64` |
+| `mapreduce.manifest.committer.summary.report.directory` | directory to save reports. | `""` |
+| `mapreduce.manifest.committer.cleanup.parallel.delete` | Delete temporary directories in parallel | `true` |
+| `mapreduce.fileoutputcommitter.cleanup.skipped` | Skip cleanup of `_temporary` directory| `false` |
+| `mapreduce.fileoutputcommitter.cleanup-failures.ignored` | Ignore errors during cleanup | `false` |
+| `mapreduce.fileoutputcommitter.marksuccessfuljobs` | Create a `_SUCCESS` marker file on successful completion. (and delete any existing one in job setup) | `true` |
+
+There are some more, as covered in the [Advanced](#advanced) section.
+
+
+## <a name="scaling"></a> Scaling jobs `mapreduce.manifest.committer.io.threads`
+
+The core reason that this committer is faster than the classic `FileOutputCommitter`
+is that it tries to parallelize as much file IO as it can during job commit, specifically:
+
+* task manifest loading
+* deletion of files where directories will be created
+* directory creation
+* file-by-file renaming
+* deletion of task attempt directories in job cleanup
+
+These operations are all performed in the same thread pool, whose size is set
+in the option `mapreduce.manifest.committer.io.threads`.
+
+Larger values may be used.
+
+XML
+```xml
+<property>
+  <name>mapreduce.manifest.committer.io.threads</name>
+  <value>200</value>
+</property>
+```
+
+spark-defaults.conf
+```
+spark.hadoop.mapreduce.manifest.committer.io.threads 200
+```
+
+A larger value than that of the number of cores allocated to
+the MapReduce AM or Spark Driver does not directly overload
+the CPUs, as the threads are normally waiting for (slow) IO
+against the object store/filesystem to complete.
+
+Caveats
+* In Spark, multiple jobs may be committed in the same process,
+  each of which will create their own thread pool during job
+  commit or cleanup.
+* Azure rate throttling may be triggered if too many IO requests
+  are made against the store. The rate throttling option
+  `mapreduce.manifest.committer.io.rate` can help avoid this.
+
+
+## <a name="deleting"></a> Optional: deleting target files in Job Commit
+
+The classic `FileOutputCommitter` deletes files at the destination paths
+before renaming the job's files into place.
+
+This is optional in the manifest committers, set in the option
+`mapreduce.manifest.committer.delete.target.files` with a default value of `false`.
+
+This increases performance and is safe to use when all files created by a job
+have unique filenames.
+
+Apache Spark does generate unique filenames for ORC and Parquet since
+[SPARK-8406](https://issues.apache.org/jira/browse/SPARK-8406)
+_Adding UUID to output file name to avoid accidental overwriting_
+
+Avoiding checks for/deleting target files saves one delete call per file being committed, so can
+save a significant amount of store IO.
+
+When appending to existing tables with formats other than ORC and Parquet,
+enable deletion of the target files unless confident that unique identifiers
+are added to each filename.
+
+```
+spark.hadoop.mapreduce.manifest.committer.delete.target.files true
+```
+
+*Note 1:* the committer will skip deletion operations when it
+created the directory into which a file is to be renamed.
+This makes it slightly more efficient, at least if jobs
+appending data are creating and writing into new partitions.
+
+*Note 2:* the committer still requires tasks within a single
+job to create unique files. This is foundational for
+any job to generate correct data.
+
+# <a name="SUCCESS"></a> Job Summaries in `_SUCCESS` files
+
+The original hadoop committer creates a zero byte `_SUCCESS` file in the root of the output directory
+unless disabled.
+
+This committer writes a JSON summary which includes
+* The name of the committer.
+* Diagnostics information.
+* A list of some of the files created (for testing; a full list is excluded as it can get big).
+* IO Statistics.
+
+If, after running a query, this `_SUCCESS` file is zero bytes long,
+*the new committer has not been used*.
+
+If it is not empty, then it can be examined.
+
+## <a name="printer"></a> Viewing `_SUCCESS` files through the `ManifestPrinter` tool.
+
+The summary files are JSON, and can be viewed in any text editor.
+
+For a more succinct summary, including better display of statistics, use the `ManifestPrinter` tool.
+
+```bash
+hadoop org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestPrinter <path>
+```
+
+This works for the files saved at the base of an output directory, and
+any reports saved to a report directory.
+
+## <a name="summaries"></a> Collecting Job Summaries `mapreduce.manifest.committer.summary.report.directory`
+
+The committer can be configured to save the `_SUCCESS` summary files to a report directory,
+irrespective of whether the job succeeded or failed, by setting a filesystem path in
+the option `mapreduce.manifest.committer.summary.report.directory`.
+
+The path does not have to be on the same
+store/filesystem as the destination of work. For example, a local filesystem could be used.
+
+XML
+
+```xml
+<property>
+  <name>mapreduce.manifest.committer.summary.report.directory</name>
+  <value>file:///tmp/reports</value>
+</property>
+```
+
+spark-defaults.conf
+
+```
+spark.hadoop.mapreduce.manifest.committer.summary.report.directory file:///tmp/reports
+```
+
+This allows for the statistics of jobs to be collected irrespective of their outcome, whether or not
+saving the `_SUCCESS` marker is enabled, and without problems caused by a chain of queries
+overwriting the markers.
+
+
+# <a name="cleanup"></a> Cleanup
+
+Job cleanup is convoluted as it is designed to address a number of issues which
+may surface in cloud storage.
+
+* Slow performance for deletion of directories.
+* Timeout when deleting very deep and wide directory trees.
+* General resilience to cleanup issues escalating to job failures.
+
+
+| Option | Meaning | Default Value |
+|--------|---------|---------------|
+| `mapreduce.fileoutputcommitter.cleanup.skipped` | Skip cleanup of `_temporary` directory| `false` |
+| `mapreduce.fileoutputcommitter.cleanup-failures.ignored` | Ignore errors during cleanup | `false` |
+| `mapreduce.manifest.committer.cleanup.parallel.delete` | Delete task attempt directories in parallel | `true` |
+
+The algorithm is:
+
+```
+if `mapreduce.fileoutputcommitter.cleanup.skipped`:
+  return
+if `mapreduce.manifest.committer.cleanup.parallel.delete`:
+  attempt parallel delete of task directories; catch any exception
+if not `mapreduce.fileoutputcommitter.cleanup.skipped`:
+  delete(`_temporary`); catch any exception
+if caught-exception and not `mapreduce.fileoutputcommitter.cleanup-failures.ignored`:
+  throw caught-exception
+```
+
+It's a bit complicated, but the goal is to perform a fast/scalable delete and
+throw a meaningful exception if that didn't work.
+
+When working with ABFS and GCS, these settings should normally be left alone.
+If somehow errors surface during cleanup, enabling the option to
+ignore failures will ensure the job still completes.
+Disabling cleanup avoids the overhead of the cleanup phase entirely, but
+requires a workflow or manual operation to clean up all
+`_temporary` directories on a regular basis.
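+
+As a sketch, a deployment which wanted cleanup problems ignored rather than escalated to
+job failure could add the following to `spark-defaults.conf`; normally these defaults are
+best left alone:
+
+```
+spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored true
+```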
+
+
+# <a name="abfs"></a> Working with Azure ADLS Gen2 Storage
+
+To switch to the manifest committer, the factory for committers for destinations with `abfs://` URLs must
+be switched to the manifest committer factory, either for the application or
+the entire cluster.
+
+```xml
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.abfs</name>
+  <value>org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory</value>
+</property>
+```
+
+This allows for ADLS Gen2-specific performance and consistency logic to be used from within the committer.
+In particular:
+* the [Etag](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag) header
+can be collected in listings and used in the job commit phase.
+* IO rename operations are rate limited
+* recovery is attempted when throttling triggers rename failures.
+
+*Warning* This committer is not compatible with older Azure storage services
+(WASB or ADLS Gen 1).
+
+The core set of Azure-optimized options becomes
+
+```xml
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.abfs</name>
+  <value>org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory</value>
+</property>
+
+<property>
+  <name>spark.hadoop.fs.azure.io.rate.limit</name>
+  <value>10000</value>
+</property>
+```
+
+And optional settings for debugging/performance analysis
+
+```xml
+<property>
+  <name>mapreduce.manifest.committer.summary.report.directory</name>
+  <value>abfs:// Path within same store/separate store</value>
+  <description>Optional: path to where job summaries are saved</description>
+</property>
+```
+
+## <a name="abfs-options"></a> Full set of ABFS options for spark
+
+```
+spark.hadoop.mapreduce.outputcommitter.factory.scheme.abfs org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory
+spark.hadoop.fs.azure.io.rate.limit 10000
+spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
+spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
+
+spark.hadoop.mapreduce.manifest.committer.summary.report.directory  (optional: URI of a directory for job summaries)
+```
+
+## Experimental: ABFS Rename Rate Limiting `fs.azure.io.rate.limit`
+
+To avoid triggering store throttling and backoff delays, as well as other
+throttling-related failure conditions, file renames during job commit
+are throttled through a "rate limiter" which limits the number of
+rename operations per second a single instance of the ABFS FileSystem client
+may issue.
+
+| Option | Meaning |
+|--------|---------|
+| `fs.azure.io.rate.limit` | Rate limit in operations/second for IO operations. |
+
+Set the option to `0` to remove all rate limiting.
+
+The default value of this is set to 10000, which is the default IO capacity for
+an ADLS storage account.
+
+```xml
+<property>
+  <name>fs.azure.io.rate.limit</name>
+  <value>10000</value>
+  <description>maximum number of renames attempted per second</description>
+</property>
+```
+
+This capacity is set at the level of the filesystem client, and so is not
+shared across all processes within a single application, let
+alone other applications sharing the same storage account.
+
+It will be shared with all jobs being committed by the same
+Spark driver, as these do share that filesystem connector.
+
+If rate limiting is imposed, the statistic `store_io_rate_limited` will
+report the time to acquire permits for committing files.
+
+If server-side throttling took place, signs of this can be seen in
+* The store service's logs and their throttling status codes (usually 503 or 500).
+* The job statistic `commit_file_rename_recovered`. This statistic indicates that
+  ADLS throttling manifested as failures in renames, failures which were recovered
+  from in the committer.
+
+If these are seen, or other applications running at the same time experience
+throttling or throttling-triggered problems, consider reducing the value of
+`fs.azure.io.rate.limit`, and/or requesting a higher IO capacity from Microsoft.
+
+*Important* if you do get extra capacity from Microsoft and you want to use
+it to speed up job commits, increase the value of `fs.azure.io.rate.limit`
+either across the cluster, or specifically for those jobs which you wish
+to allocate extra priority to.
+
+This is still a work in progress; it may be expanded to support
+all IO operations performed by a single filesystem instance.
+
+# <a name="gcs"></a> Working with Google Cloud Storage
+
+The manifest committer is compatible with and tested against Google cloud storage through
+the gcs-connector library from google, which provides a Hadoop filesystem client for the
+schema `gs`.
+
+Google cloud storage has the semantics needed for the commit protocol
+to work safely.
+
+The Spark settings to switch to this committer are
+
+```
+spark.hadoop.mapreduce.outputcommitter.factory.scheme.gs org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory
+spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
+spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
+
+spark.hadoop.mapreduce.manifest.committer.summary.report.directory  (optional: URI of a directory for job summaries)
+```
+
+The store's directory delete operations are `O(files)` so the value
+of `mapreduce.manifest.committer.cleanup.parallel.delete`
+SHOULD be left at the default of `true`.
+
+For MapReduce, declare the binding in `core-site.xml` or `mapred-site.xml`:
+```xml
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.gs</name>
+  <value>org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory</value>
+</property>
+```
+
+
+# <a name="hdfs"></a> Working with HDFS
+
+This committer _does_ work with HDFS; it has just been targeted at object stores with
+reduced performance on some operations, especially listing and renaming,
+and semantics too reduced for the classic `FileOutputCommitter` to rely on
+(specifically GCS).
+
+To use on HDFS, set the `ManifestCommitterFactory` as the committer factory for `hdfs://` URLs.
+
+Because HDFS does fast directory deletion, there is no need to parallelize deletion
+of task attempt directories during cleanup, so set
+`mapreduce.manifest.committer.cleanup.parallel.delete` to `false`.
+
+The final spark bindings become
+
+```
+spark.hadoop.mapreduce.outputcommitter.factory.scheme.hdfs org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory
+spark.hadoop.mapreduce.manifest.committer.cleanup.parallel.delete false
+spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
+spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
+
+spark.hadoop.mapreduce.manifest.committer.summary.report.directory  (optional: URI of a directory for job summaries)
+```
+
+# <a name="advanced"></a> Advanced Topics
+
+## Advanced Configuration options
+
+There are some advanced options which are intended for development and testing,
+rather than production use.
+
+| Option | Meaning | Default Value |
+|--------|---------|---------------|
+| `mapreduce.manifest.committer.store.operations.classname` | Classname for Manifest Store Operations | `""` |
+| `mapreduce.manifest.committer.validate.output` | Perform output validation? | `false` |
+
+
+## Validating output  `mapreduce.manifest.committer.validate.output`
+
+The option `mapreduce.manifest.committer.validate.output` triggers a check of every renamed file to
+verify it has the expected length.
+
+This adds the overhead of a `HEAD` request per file, and so is recommended for testing only.
+
+There is no verification of the actual contents.
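+
+To enable it for a test run (a sketch; leave it disabled in production):
+
+```xml
+<property>
+  <name>mapreduce.manifest.committer.validate.output</name>
+  <value>true</value>
+</property>
+```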
+
+## Controlling storage integration `mapreduce.manifest.committer.store.operations.classname`
+
+The manifest committer interacts with filesystems through implementations of the interface
+`ManifestStoreOperations`.
+It is possible to provide custom implementations for store-specific features.
+There is one of these for ABFS; when the abfs-specific committer factory is used this
+is automatically set.
+
+It can be explicitly set.
+```xml
+<property>
+  <name>mapreduce.manifest.committer.store.operations.classname</name>
+  <value>org.apache.hadoop.fs.azurebfs.commit.AbfsManifestStoreOperations</value>
+</property>
+```
+
+The default implementation may also be configured.
+
+```xml
+<property>
+  <name>mapreduce.manifest.committer.store.operations.classname</name>
+  <value>org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperationsThroughFileSystem</value>
+</property>
+```
+
+There is no need to alter these values, except when writing new implementations for other stores,
+something which is only needed if the store provides extra integration support for the
+committer.
+
+## <a name="concurrent"></a> Support for concurrent test runs.
+
+It *may* be possible to run multiple jobs targeting the same directory tree.
+
+For this to work, a number of conditions must be met:
+
+* When using spark, unique job IDs must be set. This means the Spark distribution
+  MUST contain the patches for
+  [SPARK-33402](https://issues.apache.org/jira/browse/SPARK-33402)
+  and
+  [SPARK-33230](https://issues.apache.org/jira/browse/SPARK-33230).
+* Cleanup of the `_temporary` directory must be disabled by setting
+  `mapreduce.fileoutputcommitter.cleanup.skipped` to `true`, as shown in the sketch below.
+* All jobs/tasks must create files with unique filenames.
+* All jobs must create output with the same directory partition structure.
+* Remember to delete the `_temporary` directory later!
+
+This has *NOT BEEN TESTED*
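+
+As an illustrative sketch only, such an (untested) setup would disable cleanup in its
+Spark configuration:
+
+```
+spark.hadoop.mapreduce.fileoutputcommitter.cleanup.skipped true
+```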

+ 335 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_architecture.md

@@ -0,0 +1,335 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+
+# Manifest Committer Architecture
+
+This document describes the architecture and other implementation/correctness
+aspects of the [Manifest Committer](manifest_committer.html)
+
+The protocol and its correctness are covered in [Manifest Committer Protocol](manifest_committer_protocol.html).
+<!-- MACRO{toc|fromDepth=0|toDepth=2} -->
+
+The _Manifest_ committer is a committer for work which provides performance on ABFS for "real world"
+queries, and performance and correctness on GCS.
+
+This committer uses the extension point which came in for the S3A committers.
+Users can declare a new committer factory for `abfs://` and `gcs://` URLs.
+It can be used through Hadoop MapReduce and Apache Spark.
+
+## Background
+
+### Terminology
+
+| Term | Meaning|
+|------|--------|
+| Committer |  A class which can be invoked by MR/Spark to perform the task and job commit operations. |
+| Spark Driver | The spark process scheduling the work and choreographing the commit operation.|
+| Job  | In MapReduce, the entire application. In spark, this is a single stage in a chain of work |
+| Job Attempt | A single attempt at a job. MR supports multiple Job attempts with recovery on partial job failure. Spark says "start again from scratch" |
+| Task | a subsection of a job, such as processing one file, or one part of a file |
+| Task ID |  ID of the task, unique within this job. Usually starts at 0 and is used in filenames (part-0000, part-0001, etc.) |
+| Task attempt (TA) | An attempt to perform a task. It may fail, in which case MR/spark will schedule another. |
+| Task Attempt ID | A unique ID for the task attempt. The Task ID + an attempt counter.|
+|  Destination directory | The final destination of work.|
+| Job Attempt Directory | A temporary directory used by the job attempt. This is always _underneath_ the destination directory, so as to ensure it is in the same encryption zone on HDFS, the same storage volume in other filesystems, etc.|
+| Task Attempt directory | (also known as "Task Attempt Working Directory"). Directory exclusive for each task attempt under which files are written |
+| Task Commit | Taking the output of a Task Attempt and making it the final/exclusive result of that "successful" Task.|
+| Job Commit | aggregating all the outputs of all committed tasks and producing the final results of the job. |
+
+
+
+The purpose of a committer is to ensure that the complete output of
+a job ends up in the destination, even in the presence of failures of tasks.
+
+- _Complete:_ the output includes the work of all successful tasks.
+- _Exclusive:_ the output of unsuccessful tasks is not present.
+- _Concurrent:_ When multiple tasks are committed in parallel the output is the same as when
+  the task commits are serialized. This is not a requirement of Job Commit.
+- _Abortable:_  jobs and tasks may be aborted prior to job commit, after which their output is not visible.
+- _Continuity of correctness:_ once a job is committed, the output of any failed,
+  aborted, or unsuccessful task MUST NOT appear at any point in the future.
+
+For Hive's classic hierarchical-directory-structured tables, job committing
+requires the output of all committed tasks to be put into the correct location
+in the directory tree.
+
+The committer built into `hadoop-mapreduce-client-core` module is the `FileOutputCommitter`.
+
+
+
+## The Manifest Committer: A high performance committer for Spark on Azure and Google storage.
+
+The Manifest Committer is a higher performance committer for ABFS and GCS storage
+for jobs which create files across deep directory trees through many tasks.
+
+It will also work on `hdfs://` and indeed, `file://` URLs, but
+it is optimized to address listing and renaming performance and throttling
+issues in cloud storage.
+
+It *will not* work correctly with S3, because it relies on an atomic rename-no-overwrite
+operation to commit the manifest file. It will also have the performance
+problems of copying rather than moving all the generated data.
+
+Although it will work with MapReduce,
+there is no handling of multiple job attempts with recovery from previous failed
+attempts.
+
+### The Manifest
+
+A manifest file is defined which contains (along with IOStatistics and some
+other things):
+
+1. A list of destination directories which must be created if they do not exist.
+1. A list of files to rename, recorded as (absolute source, absolute destination,
+   file-size) entries.
+
+### Task Commit
+
+Task attempts are committed by:
+
+1. Recursively listing the task attempt working dir to build
+   1. A list of directories under which files are renamed.
+   2. A list of files to rename: source, destination, size and optionally, etag.
+2. Saving this information in a manifest file in the job attempt directory with
+   a filename derived from the Task ID.
+   Note: writing to a temp file and then renaming to the final path will be used
+   to ensure the manifest creation is atomic.
+
+
+No renaming takes place; the files are left in their original location.
+
+The directory treewalk is single-threaded, so it is `O(directories)`,
+with each directory listing using one or more paged LIST calls.
+
+This is simple, and for most tasks, the scan is off the critical path of the job.
+
+Statistics analysis may justify moving to parallel scans in future.
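+
+The following is a minimal sketch of that scan using the public Hadoop
+`FileSystem` API. The `Rename` entry type and the relative-path remapping are
+illustrative assumptions, not the committer's actual classes:
+
+```
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Hypothetical sketch of the task-commit scan; not the committer's actual code. */
+public final class TaskCommitScanSketch {
+
+  /** One planned rename: (absolute source, absolute destination, size). */
+  static final class Rename {
+    final Path source;
+    final Path dest;
+    final long size;
+
+    Rename(Path source, Path dest, long size) {
+      this.source = source;
+      this.dest = dest;
+      this.size = size;
+    }
+  }
+
+  /**
+   * Recursively list the task attempt working dir, recording destination
+   * directories and files to rename; nothing is renamed at this point.
+   */
+  static void scan(FileSystem fs, Path taskAttemptDir, Path current, Path destDir,
+      List<Path> dirsToCreate, List<Rename> renames) throws IOException {
+    for (FileStatus st : fs.listStatus(current)) {
+      // remap a path under the task attempt dir to the same relative path under destDir
+      String relative = st.getPath().toUri().getPath()
+          .substring(taskAttemptDir.toUri().getPath().length() + 1);
+      Path dest = new Path(destDir, relative);
+      if (st.isDirectory()) {
+        dirsToCreate.add(dest);
+        scan(fs, taskAttemptDir, st.getPath(), destDir, dirsToCreate, renames);
+      } else {
+        renames.add(new Rename(st.getPath(), dest, st.getLen()));
+      }
+    }
+  }
+
+  private TaskCommitScanSketch() {
+  }
+}
+```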
+
+
+### Job Commit
+
+Job Commit consists of:
+
+1. List all manifest files in the job attempt directory.
+1. Load each manifest file, create directories which do not yet exist, then
+   rename each file in the rename list.
+1. Save a JSON `_SUCCESS` file with the same format as the S3A committer (for
+   testing; use write and rename for atomic save)
+
+The job commit phase supports parallelization for many tasks and many files
+per task, specifically (a sketch follows this list):
+
+1. Manifest tasks are loaded and processed in a pool of "manifest processor"
+   threads.
+2. Directory creation and file rename operations are each processed in a pool
+   of "executor" threads: many renames can execute in parallel as they use minimal
+   network IO.
+3. job cleanup can parallelize deletion of task attempt directories. This
+   is relevant as directory deletion is `O(files)` on Google cloud storage,
+   and also on ABFS when OAuth authentication is used.
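+
+As an illustration of the rename parallelization, a minimal sketch using a plain
+`ExecutorService` rather than the committer's actual `TaskPool`/submitter classes
+(the `renameAll` helper and the `Path[]` pairs are assumptions for this example):
+
+```
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Hypothetical sketch of parallel renames in job commit. */
+public final class ParallelRenameSketch {
+
+  /** Rename every {source, dest} pair in a fixed-size thread pool, then wait. */
+  static void renameAll(FileSystem fs, List<Path[]> renames, int threads)
+      throws Exception {
+    ExecutorService pool = Executors.newFixedThreadPool(threads);
+    try {
+      List<Future<?>> futures = new ArrayList<>();
+      for (Path[] pair : renames) {
+        futures.add(pool.submit(() -> {
+          // each rename is a single metadata operation; failures surface via the Future
+          if (!fs.rename(pair[0], pair[1])) {
+            throw new IOException("Failed to rename " + pair[0] + " to " + pair[1]);
+          }
+          return null;
+        }));
+      }
+      for (Future<?> f : futures) {
+        f.get();   // rethrows any rename failure wrapped in an ExecutionException
+      }
+    } finally {
+      pool.shutdown();
+    }
+  }
+
+  private ParallelRenameSketch() {
+  }
+}
+```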
+
+
+### Ancestor directory preparation
+
+Optional scan of all ancestor paths; if any are files, delete them.
+
+
+### Parent directory creation
+
+1. Probe the shared directory map for the directory existing. If found: the operation is
+   complete.
+1. If there is no entry in the map, call `getFileStatus()` on the path.
+   * Not found: create the directory, then add entries for it and all parent paths.
+   * Found and is a directory: add entries for it and all parent paths.
+   * Found and is a file: delete it, then create the directory as before.
+
+Efficiently handling concurrent creation of directories (or delete+create) is going to be a
+trouble spot; some effort is invested there to build the set of directories to
+create.
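+
+One possible shape for this, assuming a shared `ConcurrentHashMap` as the
+"known directories" cache (the class and method names here are illustrative,
+not the committer's actual data structures):
+
+```
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Hypothetical sketch of idempotent, concurrency-safe directory preparation. */
+public final class DirPreparationSketch {
+
+  /** Shared across worker threads: paths known to exist as directories. */
+  private final ConcurrentMap<Path, Boolean> knownDirs = new ConcurrentHashMap<>();
+
+  private final FileSystem fs;
+
+  public DirPreparationSketch(FileSystem fs) {
+    this.fs = fs;
+  }
+
+  /** Ensure a destination directory exists, deleting any file found in its place. */
+  public void prepareDir(Path dir) throws IOException {
+    if (knownDirs.containsKey(dir)) {
+      return;                      // another thread already handled this path
+    }
+    FileStatus st;
+    try {
+      st = fs.getFileStatus(dir);
+    } catch (FileNotFoundException e) {
+      st = null;                   // nothing there yet
+    }
+    if (st != null && st.isFile()) {
+      fs.delete(dir, false);       // a file is in the way: remove it
+      st = null;
+    }
+    if (st == null) {
+      fs.mkdirs(dir);              // mkdirs is idempotent under concurrent callers
+    }
+    // record this path and its ancestors so later probes are map lookups only
+    for (Path p = dir; p != null; p = p.getParent()) {
+      knownDirs.putIfAbsent(p, Boolean.TRUE);
+    }
+  }
+}
+```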
+
+### File Rename
+
+Files are renamed in parallel.
+
+A pre-rename check for anything existing at the destination path (deleting it if found) will be optional.
+As Spark creates new UUIDs for each filename, such collisions are not expected, and
+omitting the check saves HTTP requests.
+
+
+### Validation
+
+Optional scan of all committed files to verify their length and, if known, their
+etag. For testing and diagnostics.
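+
+A minimal sketch of such a check, assuming the manifest entry carries the
+expected file length (the helper below is illustrative, not the committer's
+actual validation code):
+
+```
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Hypothetical sketch of post-commit output validation. */
+public final class OutputValidationSketch {
+
+  /** Verify a committed file exists at its destination with the expected length. */
+  static void verify(FileSystem fs, Path dest, long expectedLength) throws IOException {
+    // getFileStatus raises FileNotFoundException if the file is missing
+    FileStatus st = fs.getFileStatus(dest);
+    if (st.getLen() != expectedLength) {
+      throw new IOException("Length mismatch for " + dest
+          + ": expected " + expectedLength + " but found " + st.getLen());
+    }
+    // etag comparison would additionally need the store to expose etags
+  }
+
+  private OutputValidationSketch() {
+  }
+}
+```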
+
+## Benefits
+
+* Pushes the source tree list operations into the task commit phase, which is
+  generally off the critical path of execution
+* Provides an atomic task commit to GCS, as there is no expectation that
+  directory rename is atomic
+* It is possible to pass IOStatistics from workers in the manifests.
+* Allows for some pre-rename operations similar to the S3A "Partitioned Staging
+  committer". This can be configured to delete all existing entries in
+  directories scheduled to be created, or fail if those partitions are
+  non-empty.
+  See [Partitioned Staging Committer](../../hadoop-aws/tools/hadoop-aws/committers.html#The_.E2.80.9CPartitioned.E2.80.9D_Staging_Committer)
+* Allows for an optional preflight validation check (verify no duplicate files created by different tasks)
+* Manifests can be viewed, size of output determined, etc, during
+  development/debugging.
+
+### Disadvantages
+
+* Needs a new manifest file format.
+* May make task commit more complex.
+
+This solution is necessary for GCS and should be beneficial on ABFS as listing
+overheads are paid for in the task committers.
+
+# Implementation Details
+
+### Constraints
+
+A key goal is to keep the manifest committer isolated and neither
+touch the existing committer code nor other parts of the hadoop codebase.
+
+It must plug directly into MR and Spark without needing any changes
+other than those already implemented for the S3A Committers.
+
+* Self-contained: MUST NOT require changes to hadoop-common, etc.
+* Isolated: MUST NOT make changes to existing committers
+* Integrated: MUST bind via `PathOutputCommitterFactory`.
+
+As a result of this there's a bit of copy and paste from elsewhere,
+e.g. `org.apache.hadoop.util.functional.TaskPool`
+is based on S3ACommitter's `org.apache.hadoop.fs.s3a.commit.Tasks`.
+
+The `_SUCCESS` file MUST be compatible with the S3A JSON file.
+This is to ensure any existing test suites which validate
+S3A committer output can be retargeted at jobs executed
+by the manifest committer without any changes.
+
+
+#### Progress callbacks in job commit.
+
+When? Proposed: heartbeat until renaming finally finishes.
+
+#### Error handling and aborting in job commit.
+
+We would want to stop the entire job commit. Some atomic boolean "abort job"
+would need to be checked in the processing of each task committer thread's
+iteration through a directory (or processing of each file?).
+Failures in listing or renaming will need to be escalated to halting the entire
+job commit. This implies that any IOE raised in an asynchronous rename operation or
+in a task committer thread must (as sketched after this list):
+
+1. be caught
+1. be stored in a shared field/variable
+1. trigger the abort
+1. be rethrown at the end of the `commitJob()` call
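+
+A minimal sketch of that pattern, using an `AtomicBoolean` abort flag and an
+`AtomicReference` holding the first failure (the class and interface here are
+illustrative assumptions, not the committer's actual code):
+
+```
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicReference;
+
+/** Hypothetical sketch of failure propagation across job-commit worker threads. */
+public final class CommitAbortSketch {
+
+  private final AtomicBoolean abortJob = new AtomicBoolean(false);
+  private final AtomicReference<IOException> firstFailure = new AtomicReference<>();
+
+  /** Wraps each list/rename operation executed by a worker thread. */
+  public void runOperation(IORunnable operation) {
+    if (abortJob.get()) {
+      return;                                // another worker failed: stop quickly
+    }
+    try {
+      operation.run();
+    } catch (IOException e) {
+      firstFailure.compareAndSet(null, e);   // remember only the first failure
+      abortJob.set(true);                    // tell the other workers to stop
+    }
+  }
+
+  /** Called at the end of commitJob() once all workers have finished. */
+  public void maybeRethrow() throws IOException {
+    IOException e = firstFailure.get();
+    if (e != null) {
+      throw e;
+    }
+  }
+
+  /** Minimal functional interface for an operation which may raise an IOException. */
+  @FunctionalInterface
+  public interface IORunnable {
+    void run() throws IOException;
+  }
+}
+```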
+
+#### Avoiding deadlocks
+
+If a job commit stage is using a thread pool for per-task operations, e.g. loading
+files, that same thread pool MUST NOT be used for parallel operations within
+the per-task stage.
+
+As every `JobStage` is executed in sequence within task or job commit, it
+is safe to share the same thread pool across stages.
+
+In the current implementation, there is no parallel "per manifest" operation
+in job commit other than for actually loading the files.
+The operations to create directories and to rename files are
+performed without parallel processing of individual manifests.
+
+Directory Preparation: merge the directory lists of all manifests,
+then queue for creation the (hopefully very much smaller) set of unique
+directories.
+
+Rename: iterate through all manifests and queue their renames into a pool for
+renaming.
+
+#### Thread pool lifetimes
+
+The lifespan of thread pools is constrained to that of the stage configuration,
+which will be limited to within each of the `PathOutputCommitter` methods
+to setup, commit, abort and cleanup.
+
+This avoids the thread pool lifecycle problems of the S3A Committers.
+
+#### Scale issues similar to S3A HADOOP-16570.
+
+This was a failure in terasorting where many tasks each generated many files;
+the full list of files to commit (and the etag of every block) was built up in memory and
+validated prior to execution.
+
+The manifest committer assumes that the amount of data being stored in memory is less,
+because there is no longer the need to store an etag for every block of every
+file being committed.
+
+
+#### Duplicate creation of directories in the dest dir
+
+Combine all lists of directories to create and eliminate duplicates.
+
+## Implementation Architecture
+
+The implementation architecture reflects lessons from the S3A Connector.
+
+* Isolate the commit stages from the MR commit class, as that's got a complex lifecycle.
+* Instead, break up into series of _stages_ which can be tested in isolation
+  and chained to provide the final protocol.
+* Don't pass MR data types (taskID etc) down into the stages; pass down a configuration
+  with general types (string etc).
+* Also pass in a callback for store operations, for ease of implementing a fake store.
+* For each stage: define preconditions and postconditions, failure modes. Test in isolation.
+
+#### Statistics
+
+The committer collects duration statistics on all the operations it performs/invokes
+against filesystems.
+* Those collected during task commit are saved to the manifest (excluding the time to
+  save and rename that file).
+* When these manifests are loaded during job commit, these statistics are merged to
+  form aggregate statistics of the whole job.
+* These aggregate statistics are saved to the `_SUCCESS` file,
+  and to any copy of that file in the directory specified by
+  `mapreduce.manifest.committer.summary.report.directory`, if set.
+* The class `org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestPrinter`
+  can load and print these.
+
+IO statistics from filesystems and input and output streams used in a query are not
+collected.
+
+
+## Auditing
+
+When invoking the `ManifestCommitter` via the `PathOutputCommitter` API, the following
+attributes are added to the active (thread) context:
+
+| Key   | Value           |
+|-------|-----------------|
+| `ji`  | Job ID          |
+| `tai` | Task Attempt ID |
+| `st`  | Stage           |
+
+These are also all set in all the helper threads performing work
+as part of a stage's execution.
+
+Any store/FS which supports auditing is able to collect this data
+and include it in its logs.
+
+To ease backporting, all audit integration is in the single class
+`org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration`.

+ 617 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_protocol.md

@@ -0,0 +1,617 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+
+# Manifest Committer Protocol
+
+This document describes the commit protocol
+ of the [Manifest Committer](manifest_committer.html)
+
+<!-- MACRO{toc|fromDepth=0|toDepth=2} -->
+
+## Background
+
+### Terminology
+
+| Term | Meaning|
+|------|--------|
+| Committer |  A class which can be invoked by MR or Spark to perform the task and job commit operations. |
+| Spark Driver | The spark process scheduling the work and choreographing the commit operation.|
+| Job | In MapReduce, the entire application. In Spark, this is a single stage in a chain of work |
+| Job Attempt | A single attempt at a job. MR supports multiple Job attempts with recovery on partial job failure. Spark says "start again from scratch" |
+| Task | a subsection of a job, such as processing one file, or one part of a file |
+| Task ID |  ID of the task, unique within this job. Usually starts at 0 and is used in filenames (part-0000, part-001, etc.) |
+| Task attempt (TA) | An attempt to perform a task. It may fail, in which case MR/Spark will schedule another. |
+| Task Attempt ID | A unique ID for the task attempt. The Task ID + an attempt counter.|
+|  Destination directory | The final destination of work.|
+| Job Attempt Directory | A temporary directory used by the job attempt. This is always _underneath_ the destination directory, so as to ensure it is in the same encryption zone as HDFS, storage volume in other filesystems, etc.|
+| Task Attempt directory | Directory under the Job Attempt Directory where task attempts create subdirectories for their own work |
+| Task Attempt Working Directory| Directory exclusive for each task attempt under which files are written  |
+| Task Commit | Taking the output of a Task Attempt and making it the final/exclusive result of that "successful" Task.  |
+| Job Commit | aggregating all the outputs of all committed tasks and producing the final results of the job. |
+
+
+
+The purpose of a committer is to ensure that the complete output of
+a job ends up in the destination, even in the presence of failures of tasks.
+
+- _Complete:_ the output includes the work of all successful tasks.
+- _Exclusive:_ the output of unsuccessful tasks is not present.
+- _Concurrent:_ When multiple tasks are committed in parallel the output is the same as when
+  the task commits are serialized. This is not a requirement of Job Commit.
+- _Abortable:_  jobs and tasks may be aborted prior to job commit, after which their output is not visible.
+- _Continuity of correctness:_ once a job is committed, the output of any failed,
+  aborted, or unsuccessful task MUST NOT appear at any point in the future.
+
+For Hive's classic hierarchical-directory-structured tables, job committing
+requires the output of all committed tasks to be put into the correct location
+in the directory tree.
+
+The committer built into `hadoop-mapreduce-client-core` module is the `FileOutputCommitter`.
+
+
+It has two algorithms, v1 and v2.
+
+The v1 algorithm is resilient to all forms of task failure, but slow
+when committing the final aggregate output as it renames each newly created file
+to the correct place in the table one by one.
+
+The v2 algorithm is not considered safe because the output is visible when individual
+tasks commit, rather than being delayed until job commit.
+It is possible for multiple task attempts to get their data into the output
+directory tree, and if a job fails/is aborted before the job is committed,
+the output is visible.
+
+## File Output Committer V1 and V2
+
+### File Output Committer V1 and V2 Commit algorithms
+
+#### Task attempt execution (V1 and V2)
+
+The job attempt directory in `$dest/__temporary/$jobAttemptId/` contains all output
+of the job in progress. Every task attempt is allocated its own task attempt dir
+`$dest/__temporary/$jobAttemptId/__temporary/$taskAttemptId`.
+
+All work for a task is written under the task attempt directory. If the output
+is a deep tree with files at the root, the task attempt dir will end up with a
+similar structure, with the files it has generated and the directories above
+them.
+
+### MapReduce V1 algorithm:
+
+#### v1 Task commit
+
+The task attempt dir is renamed directly underneath the job attempt dir
+
+```
+rename(
+  $dest/__temporary/$jobAttemptId/__temporary/$taskAttemptId
+  $dest/__temporary/$jobAttemptId/$taskId)
+```
+
+#### V1 Job Commit
+
+For each committed task, all files underneath are renamed into the destination
+directory, with each path relative to the base directory of the task remapped
+to the equivalent path under the dest dir.
+
+That is, everything under `$dest/__temporary/$jobAttemptId/$taskId` is converted
+to a path under `$dest`.
+
+A recursive treewalk identifies the paths to rename in each TA directory.
+There's some optimisation if the task directory tree contains a subdirectory
+which does not exist under the destination: in this case the whole
+directory can be renamed. If the directory already exists, a file-by-file merge
+takes place for that dir, with the action for subdirectories again depending on
+the presence of the destination.
+
+As a result, if the output of each task goes to a separate final directory (e.g. the
+final partition is unique to a single task), the rename is `O(1)` for the dir,
+irrespective of children. If the output is to be in the same dir as other
+tasks (or updating existing directories), then the rename performance becomes
+`O(files)`.
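+
+A simplified illustration of that merge recursion (the real logic lives in
+`FileOutputCommitter`; this sketch is not that code):
+
+```
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Simplified illustration of the V1 merge-rename recursion. */
+public final class V1MergeSketch {
+
+  static void merge(FileSystem fs, FileStatus src, Path dest) throws IOException {
+    if (src.isFile()) {
+      // files are always moved one by one, replacing anything already there
+      fs.delete(dest, true);
+      fs.rename(src.getPath(), dest);
+    } else if (!fs.exists(dest)) {
+      // the whole directory can be renamed in one operation: O(1) on HDFS
+      fs.rename(src.getPath(), dest);
+    } else {
+      // destination directory exists: merge child by child, hence O(files)
+      for (FileStatus child : fs.listStatus(src.getPath())) {
+        merge(fs, child, new Path(dest, child.getPath().getName()));
+      }
+    }
+  }
+
+  private V1MergeSketch() {
+  }
+}
+```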
+
+Finally, a 0-byte `_SUCCESS` file is written iff `mapreduce.fileoutputcommitter.marksuccessfuljobs` is true.
+
+### MapReduce V2 algorithm:
+
+#### V2 Task commit
+
+The files under the task attempt dir are renamed one by one into the destination
+directory. There's no attempt at optimising directory renaming, because other
+tasks may be committing their work at the same time. It is therefore `O(files)` +
+the cost of listing the directory tree. Again: done with a recursive treewalk,
+not a deep `listFiles(path, recursive=true)` API, which would be faster on HDFS
+and (though not relevant here) S3.
+
+#### V2 Job Commit
+
+A 0-byte `_SUCCESS` file is written iff `mapreduce.fileoutputcommitter.marksuccessfuljobs`
+is true.
+
+
+
+### Why the V2 committer is incorrect/unsafe
+
+If, for a Task T1, Task Attempt 1 (T1A1) fails before committing, the driver
+will schedule a new attempt "T1A2", and commit it. All is good.
+
+But: if T1A1 was given permission to commit and it failed during the commit
+process, some of its output may have been written to the destination directory.
+
+If attempt T1A2 was then told to commit, then if and only if its output had the
+exact set of file names would any already-renamed files be overwritten. If
+different filenames were generated, then the output would contain files of T1A1
+and T1A2.
+
+If T1A1 became partitioned during the commit process, then the job committer
+would schedule another attempt and commit its work. However, if T1A1 still had
+connectivity to the filesystem, it could still be renaming files. The output of
+the two tasks could be intermingled even if the same filenames were used.
+
+## Background: the S3A Committers
+
+The paper, [_A Zero-Rename Committer_](https://github.com/steveloughran/zero-rename-committer/releases/),
+Loughran et al., covers these committers.
+
+It also describes the commit problem, defines correctness, and describes the
+algorithms of the v1 and v2 committers, as well as those of the S3A committers,
+IBM Stocator committer and what we know of EMR's Spark committer.
+
+The `hadoop-aws` JAR contains a pair of committers, "Staging" and "Magic". Both
+of these are implementations of the same problem: safely and rapidly committing
+work to an S3 object store.
+
+The committers take advantage of the fact that S3 offers an atomic way to create
+a file: the PUT request.
+
+Files either exist or they don't. A file can be uploaded direct to its
+destination, and it is only when the upload completes that the file is manifested,
+overwriting any existing copy.
+
+For large files, a multipart upload allows this upload operation to be split
+into a series of POST requests:
+
+1. `initiate-upload (path -> upload ID)`
+1. `upload part(path, upload ID, data[]) -> checksum.`
+   This can be parallelised. Up to 10,000 parts can be uploaded to a single
+   object. All but the final part must be >= 5MB.
+1. `complete-upload (path, upload ID, List<checksum>)`
+   this manifests the file, building it from the parts in the sequence of blocks
+   defined by the ordering of the checksums.
+
+The secret for the S3A committers is that the final POST request can be delayed
+until the job commit phase, even though the files are uploaded during task
+attempt execution/commit. The task attempts need to determine the final
+destination of each file, upload the data as part of a multipart operation, then
+save the information needed to complete the upload in a file which is later read
+by the job committer and used in a POST request.
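+
+To make the split explicit, here is a hypothetical client interface for those
+three operations; it is not the real S3 SDK or S3A API, only an illustration of
+which call is deferred to job commit:
+
+```
+import java.util.List;
+
+/** Hypothetical multipart-upload client; only completeUpload() makes the object visible. */
+public interface MultipartUploadSketch {
+
+  /** Task attempt: start an upload targeted at the final destination path. */
+  String initiateUpload(String destinationPath);
+
+  /** Task attempt: upload one part; may be called in parallel. Returns the part checksum. */
+  String uploadPart(String destinationPath, String uploadId, byte[] data);
+
+  /** Job commit: the only call which manifests the object at its destination. */
+  void completeUpload(String destinationPath, String uploadId, List<String> partChecksums);
+}
+```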
+
+### Staging Committer
+
+The _Staging Committer_ is based on the contribution by Ryan Blue of Netflix.
+It relies on HDFS to be the consistent store to propagate the `.pendingset` files.
+
+The working directory of each task attempt is in the local filesystem, "the
+staging directory". The information needed to complete the uploads is passed
+from Task Attempts to the Job Committer by using a v1 FileOutputCommitter
+working with the cluster HDFS filesystem. This ensures that the committer has
+the same correctness guarantees as the v1 algorithm.
+
+1. Task commit consists of uploading all files under the local filesystem's task
+   attempt working directory to their final destination path, holding back on
+   the final manifestation POST.
+1. A JSON file containing all information needed to complete the upload of all
+   files in the task attempt is written to the Job Attempt directory of the
+   wrapped committer working with HDFS.
+1. Job commit: load in all the manifest files in the HDFS job attempt directory,
+   then issue the POST requests to complete the uploads. These are parallelised.
+
+
+### The Magic Committer
+
+The _Magic Committer_ is purely-S3A and takes advantage of
+the fact that the authors could make changes within the filesystem client itself.
+
+"Magic" paths are defined which, when opened for writing under, initiate a
+multi-party upload to the final destination directory. When the output stream is
+`close()`d, a zero byte marker file is written to the magic path, and a JSON
+.pending file containing all the information needed to complete the upload is
+saved.
+
+Task commit:
+
+1. List all `.pending` files under each task attempt's magic directory;
+1. Aggregate to a `.pendingset` file
+1. Save to the job attempt directory with the task ID.
+
+Job commit:
+
+1. List `.pendingset` files in the job attempt directory
+1. Complete the uploads with POST requests.
+
+The Magic committer absolutely requires a consistent S3 store, originally provided by
+S3Guard. Now that S3 is consistent, raw S3 can be used. It does not need HDFS
+or any other filesystem with `rename()`.
+
+### Correctness
+
+The S3A committer is considered correct because
+
+1. Nothing is materialized until job commit.
+1. Only one task attempt's manifest can be saved to the job attempt directory.
+   Hence: only one TA's files for a given task ID are exclusively committed.
+1. The staging committer's use of HDFS to pass manifests from TAs to the Job
+   committer ensures that S3's eventual consistency would not cause manifests to
+   be missed.
+1. Until S3 was consistent, the magic committer relied on S3Guard to provide the
+   list consistency needed during both task- and job- commit.
+1. The authors and wider community fixed all the issues related to the committers
+   which have surfaced in production.
+
+Significant issues which were fixed include:
+
+* [HADOOP-15961](https://issues.apache.org/jira/browse/HADOOP-15961).
+  S3A committers: make sure there's regular progress() calls.
+* [HADOOP-16570](https://issues.apache.org/jira/browse/HADOOP-16570).
+  S3A committers encounter scale issues.
+* [HADOOP-16798](https://issues.apache.org/jira/browse/HADOOP-16798).
+  S3A Committer thread pool shutdown problems.
+* [HADOOP-17112](https://issues.apache.org/jira/browse/HADOOP-17112).
+  S3A committers can't handle whitespace in paths.
+* [HADOOP-17318](https://issues.apache.org/jira/browse/HADOOP-17318).
+  Support concurrent S3A commit jobs with same app attempt ID.
+* [HADOOP-17258](https://issues.apache.org/jira/browse/HADOOP-17258).
+  MagicS3GuardCommitter fails with `pendingset` already exists
+* [HADOOP-17414](https://issues.apache.org/jira/browse/HADOOP-17414).
+  Magic committer files don't have the count of bytes written collected by spark
+* [SPARK-33230](https://issues.apache.org/jira/browse/SPARK-33230)
+  Hadoop committers to get unique job ID in `spark.sql.sources.writeJobUUID`
+* [SPARK-33402](https://issues.apache.org/jira/browse/SPARK-33402)
+  Jobs launched in same second have duplicate MapReduce JobIDs
+* [SPARK-33739](https://issues.apache.org/jira/browse/SPARK-33739).
+  Jobs committed through the S3A Magic committer don't report
+  the bytes written (depends on HADOOP-17414)
+
+Of those which affected the correctness rather than scale/performance/UX:
+HADOOP-17258 involved the recovery from a failure after TA1 task commit had
+completed, but had failed to report in. SPARK-33402, SPARK-33230 and
+HADOOP-17318 are all related: if two spark jobs/stages started in the
+same second, they had the same job ID. This caused the HDFS directories used by
+the staging committers to be intermingled.
+
+What is notable is this: these are all problems which the minimal integration
+test suites did not discover.
+
+The good news: we now know of these issues and are better placed to avoid
+replicating them again. And know what to write tests for.
+
+## The V1 committer: slow in Azure and slow and unsafe on GCS.
+
+The V1 committer underperforms on ABFS because:
+
+1. Directory listing and file renaming is somewhat slower with ABFS than it is
+   with HDFS.
+1. The v1 committer sequentially commits the output of each task through a
+   listing of each committed task's output, moving directories when none exist
+   in the destination, merging files into extant directories.
+
+The V2 committer is much faster in the job commit because it performs the list
+and rename process in the task commit, which, because it is non-atomic, is why
+it is considered dangerous to use. What the V2 task commit algorithm does show is
+that it is possible to parallelise committing the output of different tasks by
+using file-by-file rename exclusively.
+
+The V1 committer underperforms on GCS because even the task commit operation,
+directory rename, is a non-atomic `O(files)` operation.
+This also means that it is unsafe.
+
+If the task attempt has become partitioned and the Spark driver schedules/commits another TA, then
+the task dir may contain one or more files from the first attempt.
+
+----------------------------------------------------------------------------------------
+
+# The Manifest Committer Protocol
+
+## Requirements of the Store
+
+Stores/filesystems supported by this committer MUST:
+
+* Have consistent listings.
+* Have an atomic `O(1)` file rename operation.
+
+Stores/filesystems supported by this committer SHOULD:
+
+* Rename files successfully, even under load. ABFS does not do this,
+  so special recovery is provided there.
+* Implement the `EtagSource` interface of HADOOP-17979.
+  This is used for ABFS rename recovery, and for optional
+  validation of the final output.
+
+Stores/filesystems supported by this committer MAY:
+
+* Have list operations with high latency.
+* Reject calls under load with throttling responses,
+  which MUST be handled in the filesystem connector.
+
+Stores/filesystems supported by this committer MAY NOT:
+
+* Support atomic directory rename. This is never used except optionally in cleanup.
+* Support `O(1)` directory deletion. The `CleanupJobStage` assumes this is not
+  the case and so deletes task attempt directories in parallel.
+* Support an atomic `create(Path, overwrite=false)` operation.
+  The manifests are committed by writing to a path including the task attempt ID,
+  then renamed to their final path.
+* Support fast `listFiles(path, recursive=true)` calls.
+  This API call is not used.
+
+When compared with the `FileOutputCommitter`, the requirements
+which have been removed are:
+
+* Atomic directory rename.
+* `O(1)` directory deletion.
+* Fast directory listings.
+* The implicit absence of throttling behaviors.
+
+HDFS meets all those requirements, so does not benefit significantly from
+this committer, though it will still work there.
+
+The S3 store does not meet the rename requirements of this committer,
+even now that it is consistent.
+This committer is not safe to use on S3.
+
+### Task and Job IDs
+
+Every job MUST have a unique ID.
+
+The implementation expects the Spark runtime to have the relevant patches to
+ensure this.
+
+The job ID is used to name temporary directories, rather than using the classic
+incrementing natural numbering scheme of `_temporary/0/`.
+That scheme comes from MapReduce where job attempts of attempt ID &gt; 1
+look for tasks committed by predecessors and incorporate that into their
+results.
+
+This committer targets Spark, where there is no attempt at recovery.
+By using the job ID in paths, if jobs are configured to _not_ delete
+all of `_temporary` in job cleanup/abort, then multiple jobs
+MAY be executed using the same table as their destination.
+
+Task IDs and Task Attempt IDs will be derived from Job IDs as usual.
+
+It is expected that filenames of written files SHALL be unique.
+This is done in Spark for ORC and Parquet files, and allows for
+checks for destination files to be omitted by default.
+
+
+## Directory Structure
+
+Given a destination directory `destDir: Path`
+
+A job of id `jobID: String` and attempt number `jobAttemptNumber:int`
+will use the directory:
+
+```
+$destDir/_temporary/manifest_$jobID/$jobAttemptNumber/
+```
+
+for its work (note: it will actually format that final subdir with `%02d`).
+
+This is termed the _Job Attempt Directory_.
+
+Under the Job Attempt Directory, a subdirectory `tasks` is
+created. This is termed the _Task Attempt Directory_.
+Every task attempt will have its own subdirectory of this,
+into which its work will be saved.
+
+Under the Job Attempt Directory, a subdirectory `manifests` is created.
+This is termed the _Manifest Directory_.
+
+The manifests of all committed tasks will be saved to this
+directory with the filename of
+`$taskId-manifest.json`
+
+The full path
+
+```
+$destDir/_temporary/manifest_$jobID/$jobAttemptNumber/manifests/$taskId-manifest.json
+```
+
+is the final location for the manifest of all files created by
+a committed task. It is termed the _Manifest Path of a Committed Task_.
+
+Task attempts will save their manifest into this directory with
+a temporary filename
+`$taskAttemptId-manifest.json.tmp`.
+
+This is termed the _Temporary Path of a Task Attempt's Manifest_.
+
+For the job and task operations then, the following paths are
+defined.
+```
+let jobDirectory = "$destDir/_temporary/manifest_$jobID/"
+let jobAttemptDirectory = jobDirectory + "$jobAttemptNumber/"
+let manifestDirectory = jobAttemptDirectory + "manifests/"
+let taskAttemptDirectory = jobAttemptDirectory + "tasks/"
+```
+
+And for each task attempt, the following paths are also defined
+
+```
+let taskAttemptWorkingDirectory = taskAttemptDirectory + "$taskAttemptId"
+let taskManifestPath = manifestDirectory + "$taskId-manifest.json"
+let taskAttemptTemporaryManifestPath = manifestDirectory + "$taskAttemptId-manifest.json.tmp"
+```
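+
+As a concrete illustration, those paths could be built with Hadoop `Path`
+arithmetic as below, including the `%02d` formatting of the job attempt number
+noted earlier (illustrative only, not the committer's own helper methods):
+
+```
+import org.apache.hadoop.fs.Path;
+
+/** Illustrative construction of the committer's working paths. */
+public final class ManifestPathsSketch {
+
+  static Path jobAttemptDirectory(Path destDir, String jobId, int jobAttemptNumber) {
+    Path jobDirectory = new Path(destDir, "_temporary/manifest_" + jobId);
+    // the attempt number is zero-padded, e.g. "00", "01"
+    return new Path(jobDirectory, String.format("%02d", jobAttemptNumber));
+  }
+
+  static Path manifestDirectory(Path jobAttemptDir) {
+    return new Path(jobAttemptDir, "manifests");
+  }
+
+  static Path taskAttemptWorkingDirectory(Path jobAttemptDir, String taskAttemptId) {
+    return new Path(new Path(jobAttemptDir, "tasks"), taskAttemptId);
+  }
+
+  static Path taskManifestPath(Path manifestDir, String taskId) {
+    return new Path(manifestDir, taskId + "-manifest.json");
+  }
+
+  private ManifestPathsSketch() {
+  }
+}
+```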
+
+## Core Algorithm of the Protocol
+
+1. Each Task attempt writes all its files to a unique directory tree under the
+   Task Attempt Directory.
+2. Task Commit consists of a recursive scan of the directory for that task attempt,
+   creating a list of directories and a list of files.
+3. These lists are saved as a JSON manifest file.
+4. Job commit consists of listing all of the JSON manifest files,
+   loading their contents, creating the aggregate set of destination directories
+   and renaming all files into their final destinations.
+
+
+### The Intermediate Manifest
+
+This is a JSON file which contains (along with IOStatistics and some diagnostics) the following, sketched below:
+
+1. A list of destination directories which must be created if they do not exist.
+1. A list of files to rename as (absolute source, absolute destination,
+   file-size) entries.
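+
+A minimal model of that logical structure; the field names below are
+hypothetical, the actual schema being defined by the `TaskManifest` class:
+
+```
+import java.util.List;
+
+/** Hypothetical model of an intermediate manifest's logical content. */
+public class IntermediateManifestSketch {
+
+  /** Task attempt which produced this manifest. */
+  public String taskAttemptId;
+
+  /** Destination directories to create during job commit if they do not already exist. */
+  public List<String> directoriesToCreate;
+
+  /** Files to rename during job commit. */
+  public List<Entry> filesToRename;
+
+  /** One rename: absolute source, absolute destination, size and optional etag. */
+  public static class Entry {
+    public String sourcePath;
+    public String destinationPath;
+    public long size;
+    public String etag;    // null if the store does not expose etags
+  }
+}
+```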
+
+### Job Setup
+
+```
+mkdir(jobAttemptDirectory)
+mkdir(manifestDirectory)
+mkdir(taskAttemptDirectory)
+```
+
+### Task Setup
+
+```
+mkdir(taskAttemptWorkingDirectory)
+```
+
+### Task Commit
+
+Task attempts are committed by:
+
+1. Recursively listing the task attempt working dir to build
+   1. A list of destination directories under which files will be renamed,
+      and their status (exists, not_found, file)
+   2. A list of files to rename: source, destination, size and optionally, etag.
+2. These lists populate a JSON file, the _Intermediate Manifest_.
+3. The task attempt saves this file to its _Temporary Path of a Task Attempt's
+   Manifest_.
+4. The task attempt then deletes the _Manifest Path of a Committed Task_ and
+   renames its own manifest file to that path.
+5. If the rename succeeds, the task commit is considered a success.
+
+No renaming takes place at this point: the files are left in their original location until
+renamed in job commit.
+
+```
+let (renames, directories) = scan(taskAttemptWorkingDirectory)
+let manifest = new Manifest(renames, directories)
+
+manifest.save(taskAttemptTemporaryManifestPath)
+rename(taskAttemptTemporaryManifestPath, taskManifestPath)
+```
+
+### Task Abort/cleanup
+
+```
+delete(taskAttemptWorkingDirectory)
+```
+
+### Job Commit
+
+Job Commit consists of:
+
+1. List all manifest files in the job attempt directory.
+2. Load each manifest file, create directories which do not yet exist, then
+   rename each file in the rename list.
+3. Optionally save a JSON `_SUCCESS` file with the same format as the S3A committer (for
+   testing; use write and rename for atomic save)
+
+The job commit phase supports parallelization for many tasks and many files
+per task; specifically, there is a thread pool for parallel store IO, used for:
+
+1. Manifest tasks are loaded and processed in parallel.
+1. Deletion of files where directories are intended to be created.
+1. Creation of leaf directories.
+1. File rename.
+1. In cleanup and abort: deletion of task attempt directories
+1. If validation of output is enabled for testing/debugging: getFileStatus calls
+   to compare file length and, if possible, etags.
+
+```
+let manifestPaths = list("$manifestDirectory/*-manifest.json")
+let manifests = manifestPaths.map(p -> loadManifest(p))
+let directoriesToCreate = merge(manifests.directories)
+let filesToRename = concat(manifests.files)
+
+directoriesToCreate.map(p -> mkdirs(p))
+filesToRename.map((src, dest, etag) -> rename(src, dest, etag))
+
+if mapreduce.fileoutputcommitter.marksuccessfuljobs then
+  success.save("$destDir/_SUCCESS")
+
+```
+
+Implementation Note:
+
+To aid debugging and development, the summary may be saved to a location
+in the same _or different_ filesystem; the intermediate
+manifests may be renamed to a location in the target filesystem.
+
+```
+if summary.report.directory != "" then
+  success.save("${summary.report.directory}/$jobID.json")
+if diagnostics.manifest.directory != null then
+  rename($manifestDirectory, "${diagnostics.manifest.directory}/$jobID")
+```
+
+The summary report is saved even if job commit fails for any reason.
+
+### Job Abort/cleanup
+
+Job cleanup nominally consists of deleting the job directory:
+```
+delete(jobDirectory)
+```
+
+To address scale issues with the object stores, this SHALL be preceded by
+a (parallelized) deletion of all task attempt working directories:
+
+```
+let taskAttemptWorkingDirectories = list("taskAttemptDirectory")
+taskAttemptWorkingDirectories.map(p -> delete(p))
+```
+
+
+## Benefits of the new protocol
+
+* Pushes the source tree list operations into the task commit phase, which is
+  generally off the critical path of execution.
+* Reduces the number of directories probed/created to the aggregate set of
+  output directories, with all duplicates eliminated.
+* File rename can be parallelized, with the limits being that of configured
+  thread pool sizes and/or any rate limiting constraints.
+* Provides an atomic task commit to GCS, as there is no expectation that
+  directory rename is atomic.
+* Permits passing IOStatistics from task attempts to the job committer via the manifests.
+* Allows for some pre-rename operations in the Job Committer
+  similar to the S3A "Partitioned Staging committer".
+  This can be configured to delete all existing entries in
+  directories scheduled to be created, or fail if those partitions are
+  non-empty.
+  See [Partitioned Staging Committer](../../hadoop-aws/tools/hadoop-aws/committers.html#The_.E2.80.9CPartitioned.E2.80.9D_Staging_Committer)
+* Allows for an optional preflight validation check (verify no duplicate files created by different tasks).
+* Manifests can be viewed, size of output determined, etc., during development/debugging.
+
+## Disadvantages of the new protocol compared to the v1 algorithm
+
+* Needs a new manifest file format.
+* Manifests may get large if tasks create many files and/or subdirectories, or if
+  etags are collected and the length of these tags is significant.
+  The HTTP protocol limits each etag to 8 KiB, so the cost may be 8 KiB per file.
+* Makes task commit more complex than the v1 algorithm.
+* Possibly suboptimal on jobs where individual tasks create unique output directories,
+  as directory rename will never be used to commit a directory.

+ 1085 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/AbstractManifestCommitterTest.java

@@ -0,0 +1,1085 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import org.assertj.core.api.Assertions;
+import org.junit.AfterClass;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.contract.AbstractFSContract;
+import org.apache.hadoop.fs.contract.AbstractFSContractTestBase;
+import org.apache.hadoop.fs.contract.ContractTestUtils;
+import org.apache.hadoop.fs.contract.localfs.LocalFSContract;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SaveTaskManifestStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.util.DurationInfo;
+import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.functional.CloseableTaskPoolSubmitter;
+import org.apache.hadoop.util.functional.RemoteIterators;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.readDataset;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics;
+import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.SUCCESSFUL_JOB_OUTPUT_DIR_MARKER;
+import static org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory.COMMITTER_FACTORY_CLASS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConfig.createCloseableTaskSubmitter;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_ID_SOURCE_MAPREDUCE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_FACTORY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_DIAGNOSTICS_MANIFEST_DIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_SUMMARY_REPORT_DIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_VALIDATE_OUTPUT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.getProjectBuildDir;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.validateSuccessFile;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.NAME_FORMAT_JOB_ATTEMPT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createIOStatisticsStore;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createTaskManifest;
+import static org.apache.hadoop.util.functional.FutureIO.awaitFuture;
+
+/**
+ * Tests which work with manifest committers.
+ * This is a filesystem contract bound to the local filesystem;
+ * subclasses may change the FS to test against other stores.
+ * Some fields are set up
+ * in {@link #executeOneTaskAttempt(int, int, int)},
+ * which is why fields are used.
+ * When synchronized access is needed, synchronize on (this) rather
+ * than individual fields.
+ */
+public abstract class AbstractManifestCommitterTest
+    extends AbstractFSContractTestBase {
+
+  protected static final Logger LOG =
+      LoggerFactory.getLogger(AbstractManifestCommitterTest.class);
+
+  /**
+   * Some Job and task IDs.
+   */
+  protected static final ManifestCommitterTestSupport.JobAndTaskIDsForTests
+      TASK_IDS = new ManifestCommitterTestSupport.JobAndTaskIDsForTests(2, 2);
+
+  public static final int JOB1 = 1;
+
+  public static final int TASK0 = 0;
+
+  public static final int TASK1 = 1;
+
+  /**
+   * Task attempt 0 index.
+   */
+  public static final int TA0 = 0;
+
+  /**
+   * Task attempt 1 index.
+   */
+  public static final int TA1 = 1;
+
+  /**
+   * Depth of dir tree to generate.
+   */
+  public static final int DEPTH = 3;
+
+  /**
+   * Width of dir tree at every level.
+   */
+  public static final int WIDTH = 2;
+
+  /**
+   * How many files to create in the leaf directories.
+   */
+  public static final int FILES_PER_DIRECTORY = 4;
+
+  /**
+   * Pool size.
+   */
+  public static final int POOL_SIZE = 32;
+
+  /**
+   * FileSystem statistics are collected across every test case.
+   */
+  protected static final IOStatisticsSnapshot FILESYSTEM_IOSTATS =
+      snapshotIOStatistics();
+
+  /**
+   * Counter for creating files. Ensures that across all test suites,
+   * duplicate filenames are never created. Helps assign blame.
+   */
+  private static final AtomicLong CREATE_FILE_COUNTER = new AtomicLong();
+
+  protected static final byte[] NO_DATA = new byte[0];
+
+  /**
+   * The thread leak tracker.
+   */
+  private static final ThreadLeakTracker THREAD_LEAK_TRACKER = new ThreadLeakTracker();
+
+  private static final int MAX_LEN = 64_000;
+
+  /**
+   * Submitter for tasks; may be null.
+   */
+  private CloseableTaskPoolSubmitter submitter;
+
+  /**
+   * Stage statistics. Created in test setup, and in
+   * teardown updates {@link #FILESYSTEM_IOSTATS}.
+   */
+  private IOStatisticsStore stageStatistics;
+
+  /**
+   * Prefer to use these to interact with the FS to
+   * ensure more implicit coverage.
+   */
+  private ManifestStoreOperations storeOperations;
+
+  /**
+   * Progress counter used in all stage configs.
+   */
+  private final ProgressCounter progressCounter = new ProgressCounter();
+
+  /**
+   * Directory for job summary reports.
+   * This should be set up in test suites testing against real object stores.
+   */
+  private File reportDir;
+
+  /**
+   * List of task attempt IDs for those tests which create manifests.
+   */
+  private final List<String> taskAttemptIds = new ArrayList<>();
+
+  /**
+   * List of task IDs for those tests which create manifests.
+   */
+  private final List<String> taskIds = new ArrayList<>();
+
+  /**
+   * any job stage configuration created for operations.
+   */
+  private StageConfig jobStageConfig;
+
+  /**
+   * Destination dir of job.
+   */
+  private Path destDir;
+
+  /**
+   * When creating manifests, total data size.
+   */
+  private final AtomicLong totalDataSize = new AtomicLong();
+
+  /**
+   * Where to move manifests; must be in target FS.
+   */
+  private Path manifestDir;
+
+  /**
+   * Get the contract configuration.
+   * @return the config used to create the FS.
+   */
+  protected Configuration getConfiguration() {
+    return getContract().getConf();
+  }
+
+  /**
+   * Store operations to interact with.
+   * @return store operations.
+   */
+  protected ManifestStoreOperations getStoreOperations() {
+    return storeOperations;
+  }
+
+  /**
+   * Set store operations.
+   * @param storeOperations new value
+   */
+  protected void setStoreOperations(final ManifestStoreOperations storeOperations) {
+    this.storeOperations = storeOperations;
+  }
+
+  public List<String> getTaskAttemptIds() {
+    return taskAttemptIds;
+  }
+
+  public List<String> getTaskIds() {
+    return taskIds;
+  }
+
+  public long getTotalDataSize() {
+    return totalDataSize.get();
+  }
+
+  public Path getManifestDir() {
+    return manifestDir;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return the builder
+   */
+  public AbstractManifestCommitterTest withManifestDir(Path value) {
+    manifestDir = value;
+    return this;
+  }
+
+  /**
+   * Describe a test in the logs.
+   * @param text text to print
+   * @param args arguments to format in the printing
+   */
+  protected void describe(String text, Object... args) {
+    LOG.info("\n\n{}: {}\n",
+        getMethodName(),
+        String.format(text, args));
+  }
+
+  /**
+   * Local FS unless overridden.
+   * @param conf configuration
+   * @return the FS contract.
+   */
+  @Override
+  protected AbstractFSContract createContract(final Configuration conf) {
+    return new LocalFSContract(conf);
+  }
+
+  /** Enable the manifest committer options in the configuration. */
+  @Override
+  protected Configuration createConfiguration() {
+    return enableManifestCommitter(super.createConfiguration());
+  }
+
+  @Override
+  public void setup() throws Exception {
+
+    // set the manifest committer to a localfs path for reports across
+    // all threads.
+    // do this before superclass setup so reportDir is non-null there
+    // and can be used in creating the configuration.
+    reportDir = new File(getProjectBuildDir(), "reports");
+    reportDir.mkdirs();
+
+    // superclass setup includes creating a filesystem instance
+    // for the target store.
+    super.setup();
+
+    manifestDir = path("manifests");
+
+    // destination directory defaults to method path in
+    // target FS
+    setDestDir(methodPath());
+
+    // stage statistics
+    setStageStatistics(createIOStatisticsStore().build());
+    // thread pool for task submission.
+    setSubmitter(createCloseableTaskSubmitter(POOL_SIZE, TASK_IDS.getJobId()));
+    // store operations for the target filesystem.
+    storeOperations = createManifestStoreOperations();
+  }
+
+  /**
+   * Override point: create the store operations.
+   * @return store operations for this suite.
+   */
+  protected ManifestStoreOperations createManifestStoreOperations() throws IOException {
+    final FileSystem fs = getFileSystem();
+    return ManifestCommitterSupport.createManifestStoreOperations(fs.getConf(), fs, getTestPath());
+  }
+
+  @Override
+  public void teardown() throws Exception {
+    Thread.currentThread().setName("teardown");
+
+    IOUtils.cleanupWithLogger(LOG, storeOperations, getSubmitter());
+    storeOperations = null;
+    super.teardown();
+    FILESYSTEM_IOSTATS.aggregate(retrieveIOStatistics(getFileSystem()));
+    FILESYSTEM_IOSTATS.aggregate(getStageStatistics());
+  }
+
+  /**
+   * Add a long delay so that you don't get timeouts when working
+   * with object stores or debugging.
+   * @return a longer timeout than the base classes.
+   */
+  @Override
+  protected int getTestTimeoutMillis() {
+    return 600_000;
+  }
+
+  protected Path getTestPath() {
+    return getContract().getTestPath();
+  }
+
+  /**
+   * Get the task submitter.
+   * @return submitter or null
+   */
+  protected CloseableTaskPoolSubmitter getSubmitter() {
+    return submitter;
+  }
+
+  /**
+   * Set the task submitter.
+   * @param submitter new value.
+   */
+  protected void setSubmitter(CloseableTaskPoolSubmitter submitter) {
+    this.submitter = submitter;
+  }
+
+  /**
+   * Get the executor which the submitter also uses.
+   * @return an executor.
+   */
+  protected ExecutorService getExecutorService() {
+    return getSubmitter().getPool();
+  }
+  /**
+   * @return IOStatistics for stage.
+   */
+  protected final IOStatisticsStore getStageStatistics() {
+    return stageStatistics;
+  }
+
+  /**
+   * Set the statistics.
+   * @param stageStatistics statistics.
+   */
+  protected final void setStageStatistics(IOStatisticsStore stageStatistics) {
+    this.stageStatistics = stageStatistics;
+  }
+
+  /**
+   * Get the progress counter invoked during commit operations.
+   * @return progress counter.
+   */
+  protected final ProgressCounter getProgressCounter() {
+    return progressCounter;
+  }
+
+  /**
+   * Get the report directory.
+   * @return report directory.
+   */
+  public final File getReportDir() {
+    return reportDir;
+  }
+
+  /**
+   * Get the report directory as a URI.
+   * @return report directory.
+   */
+  public final URI getReportDirUri() {
+    return getReportDir().toURI();
+  }
+
+  /**
+   * Get the (shared) thread leak tracker.
+   * @return the thread leak tracker.
+   */
+  protected static ThreadLeakTracker getThreadLeakTracker() {
+    return THREAD_LEAK_TRACKER;
+  }
+
+  /**
+   * Make sure there's no thread leakage.
+   */
+  @AfterClass
+  public static void threadLeakage() {
+    THREAD_LEAK_TRACKER.assertNoThreadLeakage();
+  }
+
+  /**
+   * Dump the filesystem statistics after the class.
+   */
+  @AfterClass
+  public static void dumpFileSystemIOStatistics() {
+    LOG.info("Aggregate FileSystem Statistics {}",
+        ioStatisticsToPrettyString(FILESYSTEM_IOSTATS));
+  }
+
+  /**
+   * Create a directory tree through an executor.
+   * dirs created = width^depth;
+   * file count = width^depth * files
+   * If createDirs == true, then directories are created at the bottom,
+   * not files.
+   * @param base base dir
+   * @param prefix prefix for filenames.
+   * @param executor submitter.
+   * @param depth depth of dirs
+   * @param width width of dirs
+   * @param files files to add in each base dir.
+   * @param createDirs create directories rather than files?
+   * @return the list of paths
+   * @throws IOException failure.
+   */
+  public final List<Path> createFilesOrDirs(Path base,
+      String prefix,
+      ExecutorService executor,
+      int depth,
+      int width,
+      int files,
+      boolean createDirs) throws IOException {
+
+    try (DurationInfo ignored = new DurationInfo(LOG, true,
+        "Creating Files %s (%d, %d, %d) under %s",
+        prefix, depth, width, files, base)) {
+
+      assertPathExists("Task attempt dir", base);
+
+      // create the files in the thread pool.
+      List<Future<Path>> futures = createFilesOrDirs(
+          new ArrayList<>(),
+          base, prefix,
+          executor,
+          depth, width, files,
+          createDirs);
+      List<Path> result = new ArrayList<>();
+
+      // now wait for the creations to finish.
+      for (Future<Path> f : futures) {
+        result.add(awaitFuture(f));
+      }
+      return result;
+    }
+  }
+
+  /**
+   * Counter incremented for each file created.
+   */
+  private final AtomicLong fileDataGenerator = new AtomicLong();
+
+  /**
+   * Create files or directories; done in a treewalk and building up
+   * a list of futures to wait for. The list is
+   * built up incrementally rather than through some merging of
+   * lists created down the tree.
+   * If createDirs == true, then directories are created at the bottom,
+   * not files.
+   *
+   * @param futures list of futures to build up.
+   * @param base base dir
+   * @param prefix prefix for filenames.
+   * @param executor submitter.
+   * @param depth depth of dirs
+   * @param width width of dirs
+   * @param files files to add in each base dir.
+   * @param createDirs create directories rather than files?
+   * @return the list of futures
+   */
+  private List<Future<Path>> createFilesOrDirs(
+      List<Future<Path>> futures,
+      Path base,
+      String prefix,
+      ExecutorService executor,
+      int depth,
+      int width,
+      int files,
+      boolean createDirs) {
+
+    if (depth > 0) {
+      // still creating directories
+      for (int i = 0; i < width; i++) {
+        Path child = new Path(base,
+            String.format("dir-%02d-%02d", depth, i));
+        // propagate createDirs so that, when requested, directories (not files) are created at the leaves
+        createFilesOrDirs(futures, child, prefix, executor, depth - 1, width, files, createDirs);
+      }
+    } else {
+      // time to create files
+      for (int i = 0; i < files; i++) {
+        Path file = new Path(base,
+            String.format("%s-%04d", prefix,
+                CREATE_FILE_COUNTER.incrementAndGet()));
+        // build the data. Not actually used in mkdir.
+        long entry = fileDataGenerator.incrementAndGet() & 0xffff;
+        byte[] data = new byte[2];
+        data[0] = (byte) (entry & 0xff);
+        data[1] = (byte) ((entry & 0xff00) >> 8);
+        // the async operation.
+        Future<Path> f = executor.submit(() -> {
+          if (!createDirs) {
+            // create files
+            ContractTestUtils.createFile(getFileSystem(), file, true, data);
+          } else {
+            // create directories
+            mkdirs(file);
+          }
+          return file;
+        });
+        futures.add(f);
+      }
+    }
+    return futures;
+  }
+
+  /**
+   * Create a list of paths under a dir.
+   * @param base base dir
+   * @param count count
+   * @return the list
+   */
+  protected List<Path> subpaths(Path base, int count) {
+    return IntStream.rangeClosed(1, count)
+        .mapToObj(i -> new Path(base, String.format("entry-%02d", i)))
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * Submit a mkdir call to the executor pool.
+   * @param path path of dir to create.
+   * @return future
+   */
+  protected CompletableFuture<Path> asyncMkdir(final Path path) {
+    CompletableFuture<Path> f = new CompletableFuture<>();
+    getExecutorService().submit(() -> {
+      try {
+        mkdirs(path);
+        f.complete(path);
+      } catch (IOException e) {
+        f.completeExceptionally(e);
+      }
+    });
+    return f;
+  }
+
+  /**
+   * Given a list of paths, create the dirs async.
+   * @param paths path list
+   * @throws IOException failure
+   */
+  protected void asyncMkdirs(Collection<Path> paths) throws IOException {
+    List<CompletableFuture<Path>> futures = new ArrayList<>();
+    // initiate
+    for (Path path: paths) {
+      futures.add(asyncMkdir(path));
+    }
+    // await
+    for (Future<Path> f : futures) {
+      awaitFuture(f);
+    }
+  }
+
+  /**
+   * Submit an operation to create a file to the executor pool.
+   * @param path path of file to create.
+   * @return future
+   */
+  protected CompletableFuture<Path> asyncPut(final Path path, byte[] data) {
+    CompletableFuture<Path> f = new CompletableFuture<>();
+    getExecutorService().submit(() -> {
+      try {
+        ContractTestUtils.createFile(getFileSystem(), path, true, data);
+        f.complete(path);
+      } catch (IOException e) {
+        f.completeExceptionally(e);
+      }
+    });
+    return f;
+  }
+
+  /**
+   * Convert the manifest list to a map by task attempt ID.
+   * @param list manifests
+   * @return a map, indexed by task attempt ID.
+   */
+  protected Map<String, TaskManifest> toMap(List<TaskManifest> list) {
+    return list.stream()
+        .collect(Collectors.toMap(TaskManifest::getTaskAttemptID, x -> x));
+  }
+
+  /**
+   * Verify the manifest files match the list of paths.
+   * @param manifest manifest to audit
+   * @param files list of files.
+   */
+  protected void verifyManifestFilesMatch(final TaskManifest manifest,
+      final List<Path> files) {
+    // get the list of source paths
+    Set<Path> filesToRename = manifest.getFilesToCommit()
+        .stream()
+        .map(FileEntry::getSourcePath)
+        .collect(Collectors.toSet());
+    // which must match that of all the files created
+    Assertions.assertThat(filesToRename)
+        .containsExactlyInAnyOrderElementsOf(files);
+  }
+
+  /**
+   * Verify that a task manifest has a given attempt ID.
+   * @param manifest manifest, may be null.
+   * @param attemptId expected attempt ID
+   * @return the manifest, guaranteed to be non-null and of the given task attempt.
+   */
+  protected TaskManifest verifyManifestTaskAttemptID(
+      final TaskManifest manifest,
+      final String attemptId) {
+    Assertions.assertThat(manifest)
+        .describedAs("Manifest of task %s", attemptId)
+        .isNotNull();
+    Assertions.assertThat(manifest.getTaskAttemptID())
+        .describedAs("Task Attempt ID of manifest %s", manifest)
+        .isEqualTo(attemptId);
+    return manifest;
+  }
+
+  /**
+   * Assert that a path must exist; return the path.
+   * @param message text for error message.
+   * @param path path to validate.
+   * @return the path
+   * @throws IOException IO Failure
+   */
+  Path pathMustExist(final String message,
+      final Path path) throws IOException {
+    assertPathExists(message, path);
+    return path;
+  }
+
+  /**
+   * Assert that a path must exist; return the path.
+   * It must also equal the expected value.
+   * @param message text for error message.
+   * @param expectedPath expected path.
+   * @param actualPath path to validate.
+   * @return the path
+   * @throws IOException IO Failure
+   */
+  Path verifyPath(final String message,
+      final Path expectedPath,
+      final Path actualPath) throws IOException {
+    Assertions.assertThat(actualPath)
+        .describedAs(message)
+        .isEqualTo(expectedPath);
+    return pathMustExist(message, actualPath);
+  }
+
+  /**
+   * Verify that the specified dir has the {@code _SUCCESS} marker
+   * and that it can be loaded.
+   * The contents will be logged and returned.
+   * @param dir directory to scan
+   * @param jobId job ID, only verified if non-empty
+   * @return the loaded success data
+   * @throws IOException IO Failure
+   */
+  protected ManifestSuccessData verifySuccessMarker(Path dir, String jobId)
+      throws IOException {
+    return validateSuccessFile(getFileSystem(), dir, 0, jobId);
+  }
+
+  /**
+   * Read a UTF-8 file.
+   * @param path path to read
+   * @return string value
+   * @throws IOException IO failure
+   */
+  protected String readFile(Path path) throws IOException {
+    return ContractTestUtils.readUTF8(getFileSystem(), path, -1);
+  }
+
+  /**
+   * Modify a (job) config to switch to the manifest committer;
+   * output validation is also enabled.
+   * @param conf config to patch.
+   * @return the updated configuration.
+   */
+  protected Configuration enableManifestCommitter(final Configuration conf) {
+    conf.set(COMMITTER_FACTORY_CLASS, MANIFEST_COMMITTER_FACTORY);
+    // always create a job marker
+    conf.setBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true);
+    // and validate the output, for extra rigor
+    conf.setBoolean(OPT_VALIDATE_OUTPUT, true);
+
+    // set the manifest rename dir if non-null
+    if (getManifestDir() != null) {
+      conf.set(OPT_DIAGNOSTICS_MANIFEST_DIR,
+          getManifestDir().toUri().toString());
+    }
+
+    // and bind the report dir
+    conf.set(OPT_SUMMARY_REPORT_DIR, getReportDirUri().toString());
+    return conf;
+  }
+
+  /**
+   * Create the stage config for a job but don't finalize it.
+   * Uses {@link #TASK_IDS} for job/task ID.
+   * @param jobAttemptNumber job attempt number
+   * @param outputPath path where the final output goes
+   * @return the config
+   */
+  protected StageConfig createStageConfigForJob(
+      final int jobAttemptNumber,
+      final Path outputPath) {
+    return createStageConfig(jobAttemptNumber, -1, 0, outputPath);
+  }
+
+  /**
+   * Create the stage config for job or task but don't finalize it.
+   * Uses {@link #TASK_IDS} for job/task ID.
+   * @param jobAttemptNumber job attempt number
+   * @param taskIndex task attempt index; -1 for job attempt only.
+   * @param taskAttemptNumber task attempt number
+   * @param outputPath path where the final output goes
+   * @return the config
+   */
+  protected StageConfig createStageConfig(
+      final int jobAttemptNumber,
+      final int taskIndex,
+      final int taskAttemptNumber,
+      final Path outputPath) {
+    final String jobId = TASK_IDS.getJobId();
+    ManifestCommitterSupport.AttemptDirectories attemptDirs =
+        new ManifestCommitterSupport.AttemptDirectories(outputPath,
+            jobId, jobAttemptNumber);
+    StageConfig config = new StageConfig();
+    config
+        .withIOProcessors(getSubmitter())
+        .withIOStatistics(getStageStatistics())
+        .withJobId(jobId)
+        .withJobIdSource(JOB_ID_SOURCE_MAPREDUCE)
+        .withJobAttemptNumber(jobAttemptNumber)
+        .withJobDirectories(attemptDirs)
+        .withName(String.format(NAME_FORMAT_JOB_ATTEMPT, jobId))
+        .withOperations(getStoreOperations())
+        .withProgressable(getProgressCounter());
+
+    // if there's a task attempt ID set, set up its details
+    if (taskIndex >= 0) {
+      String taskAttempt = TASK_IDS.getTaskAttempt(taskIndex,
+          taskAttemptNumber);
+      config
+          .withTaskAttemptId(taskAttempt)
+          .withTaskId(TASK_IDS.getTaskIdType(taskIndex).toString())
+          .withTaskAttemptDir(
+              attemptDirs.getTaskAttemptPath(taskAttempt));
+    }
+    return config;
+  }
+
+  /**
+   * A job stage config.
+   * @return stage config or null.
+   */
+  protected StageConfig getJobStageConfig() {
+    return jobStageConfig;
+  }
+
+  protected void setJobStageConfig(StageConfig jobStageConfig) {
+    this.jobStageConfig = jobStageConfig;
+  }
+
+  protected Path getDestDir() {
+    return destDir;
+  }
+
+  protected void setDestDir(Path destDir) {
+    this.destDir = destDir;
+  }
+
+  /**
+   * Execute a set of tasks; task ID is a simple count.
+   * The task attempt number is the lowest 2 bits of the task ID.
+   * @param taskAttemptCount number of task attempts.
+   * @param filesPerTaskAttempt number of files to include in manifest.
+   * @return the manifests.
+   * @throws IOException IO failure.
+   */
+  protected List<TaskManifest> executeTaskAttempts(int taskAttemptCount,
+      int filesPerTaskAttempt) throws IOException {
+
+    try (DurationInfo di = new DurationInfo(LOG, true, "create manifests")) {
+
+      // build a list of the task IDs.
+      // it's surprisingly awkward to create a List<Integer>; the java8
+      // IntStream classes don't quite fit, as they do their best to
+      // keep things unboxed, and trying to map(Integer::valueOf) doesn't help.
+      List<Integer> taskIdList = new ArrayList<>(taskAttemptCount);
+      for (int t = 0; t < taskAttemptCount; t++) {
+        taskIdList.add(t);
+      }
+
+      // execute the tasks
+      List<TaskManifest> manifests = Collections.synchronizedList(
+          new ArrayList<>());
+
+      // then submit their creation/save to the pool.
+      TaskPool.foreach(taskIdList)
+          .executeWith(getSubmitter())
+          .stopOnFailure()
+          .run(i -> {
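+            // task attempt number: the lowest two bits of the task index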
+            manifests.add(
+                executeOneTaskAttempt(i, i & 0x03, filesPerTaskAttempt));
+          });
+      return manifests;
+
+    }
+  }
+
+  /**
+   * Create a task ID and attempt (adding to taskIds and taskAttemptIds),
+   * set up the task, create a manifest with fake task entries
+   * and save that manifest to the job attempt dir.
+   * No actual files are created.
+   * @param task task index
+   * @param taskAttempt task attempt value
+   * @param filesPerTaskAttempt number of files to include in manifest.
+   * @return the manifest
+   * @throws IOException failure
+   */
+  protected TaskManifest executeOneTaskAttempt(final int task,
+      int taskAttempt, final int filesPerTaskAttempt) throws IOException {
+
+    String tid = String.format("task_%03d", task);
+    String taskAttemptId = String.format("%s_%02d",
+        tid, taskAttempt);
+    synchronized (this) {
+      taskIds.add(tid);
+      taskAttemptIds.add(taskAttemptId);
+    }
+    // for each task, a job config is created then patched with the task info
+    StageConfig taskStageConfig = createTaskStageConfig(JOB1, tid, taskAttemptId);
+
+    LOG.info("Generating manifest for {}", taskAttemptId);
+
+    // task setup: create dest dir.
+    // This helps generate a realistic
+    // workload for the parallelized job cleanup.
+    new SetupTaskStage(taskStageConfig).apply("task " + taskAttemptId);
+
+    final TaskManifest manifest = createTaskManifest(taskStageConfig);
+
+    Path taDir = taskStageConfig.getTaskAttemptDir();
+    long size = task * 1000_0000L;
+
+    // for each task attempt: filesPerTaskAttempt dirs, one file per dir.
+    for (int i = 0; i < filesPerTaskAttempt; i++) {
+      Path in = new Path(taDir, "dir-" + i);
+      Path out = new Path(getDestDir(), "dir-" + i);
+      manifest.addDirectory(DirEntry.dirEntry(out, 0, 1));
+      String name = taskStageConfig.getTaskAttemptId() + ".csv";
+      Path src = new Path(in, name);
+      Path dest = new Path(out, name);
+      long fileSize = size + i * 1000L;
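+      // the entry's etag is the hex string of its (fake) file size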
+      manifest.addFileToCommit(
+          new FileEntry(src, dest, fileSize, Long.toString(fileSize, 16)));
+      totalDataSize.addAndGet(fileSize);
+    }
+
+    // save the manifest for this stage.
+    new SaveTaskManifestStage(taskStageConfig).apply(manifest);
+    return manifest;
+  }
+
+  public StageConfig createTaskStageConfig(final int jobId, final String tid,
+      final String taskAttemptId) {
+    Path jobAttemptTaskSubDir = getJobStageConfig().getJobAttemptTaskSubDir();
+    StageConfig taskStageConfig = createStageConfigForJob(jobId, getDestDir())
+        .withTaskId(tid)
+        .withTaskAttemptId(taskAttemptId)
+        .withTaskAttemptDir(new Path(jobAttemptTaskSubDir, taskAttemptId));
+    return taskStageConfig;
+  }
+
+  /**
+   * Verify that the job directories have been cleaned up.
+   * @throws IOException IO failure
+   */
+  protected void verifyJobDirsCleanedUp() throws IOException {
+    StageConfig stageConfig = getJobStageConfig();
+    assertPathDoesNotExist("Job attempt dir", stageConfig.getJobAttemptDir());
+    assertPathDoesNotExist("dest temp dir", stageConfig.getOutputTempSubDir());
+  }
+
+  /**
+   * List a directory/directory tree and print files.
+   * @param fileSystem FS
+   * @param path path
+   * @param recursive do a recursive listing?
+   * @return the number of files found.
+   * @throws IOException failure.
+   */
+  public static long lsR(FileSystem fileSystem, Path path, boolean recursive)
+      throws Exception {
+    if (path == null) {
+      // surfaces when someone calls getParent() on something at the top
+      // of the path
+      LOG.info("Empty path");
+      return 0;
+    } else {
+      LOG.info("Listing of {}", path);
+      final long count = RemoteIterators.foreach(
+          fileSystem.listFiles(path, recursive),
+          (status) -> LOG.info("{}", status));
+      LOG.info("Count of entries: {}", count);
+      return count;
+    }
+  }
+
+  /**
+   * Assert that a cleanup stage had a given outcome and
+   * deleted the given number of directories.
+   * @param result result to analyze
+   * @param outcome expected outcome
+   * @param expectedDirsDeleted #of directories deleted. -1 for no checks
+   */
+  protected void assertCleanupResult(
+      CleanupJobStage.Result result,
+      CleanupJobStage.Outcome outcome,
+      int expectedDirsDeleted) {
+    Assertions.assertThat(result.getOutcome())
+        .describedAs("Outcome of cleanup() in %s", result)
+        .isEqualTo(outcome);
+    if (expectedDirsDeleted >= 0) {
+      Assertions.assertThat(result.getDeleteCalls())
+          .describedAs("Number of directories deleted in cleanup %s", result)
+          .isEqualTo(expectedDirsDeleted);
+    }
+  }
+
+  /**
+   * Create and execute a cleanup stage.
+   * @param enabled is the stage enabled?
+   * @param deleteTaskAttemptDirsInParallel delete task attempt dirs in
+   *        parallel?
+   * @param suppressExceptions suppress exceptions?
+   * @param outcome expected outcome.
+   * @param expectedDirsDeleted #of directories deleted. -1 for no checks
+   * @return the result
+   * @throws IOException non-suppressed exception
+   */
+  protected CleanupJobStage.Result cleanup(
+      final boolean enabled,
+      final boolean deleteTaskAttemptDirsInParallel,
+      final boolean suppressExceptions,
+      final CleanupJobStage.Outcome outcome,
+      final int expectedDirsDeleted) throws IOException {
+    StageConfig stageConfig = getJobStageConfig();
+    CleanupJobStage.Result result = new CleanupJobStage(stageConfig)
+        .apply(new CleanupJobStage.Arguments(OP_STAGE_JOB_CLEANUP,
+            enabled, deleteTaskAttemptDirsInParallel, suppressExceptions));
+    assertCleanupResult(result, outcome, expectedDirsDeleted);
+    return result;
+  }
+
+  /**
+   * Read the UTF_8 text in the file.
+   * @param path path to read
+   * @return the string
+   * @throws IOException failure
+   */
+  protected String readText(final Path path) throws IOException {
+
+    final FileSystem fs = getFileSystem();
+    final FileStatus st = fs.getFileStatus(path);
+    Assertions.assertThat(st.getLen())
+        .describedAs("length of file %s", st)
+        .isLessThanOrEqualTo(MAX_LEN);
+
+    return new String(
+        readDataset(fs, path, (int) st.getLen()),
+        StandardCharsets.UTF_8);
+  }
+
+  /**
+   * Counter.
+   */
+  protected static final class ProgressCounter implements Progressable {
+
+    private final AtomicLong counter = new AtomicLong();
+
+    /**
+     * Increment the counter.
+     */
+    @Override
+    public void progress() {
+      counter.incrementAndGet();
+    }
+
+    /**
+     * Get the counter value.
+     * @return the current value.
+     */
+    public long value() {
+      return counter.get();
+    }
+
+    /**
+     * Reset the counter.
+     */
+    public void reset() {
+      counter.set(0);
+    }
+
+    @Override
+    public String toString() {
+      final StringBuilder sb = new StringBuilder(
+          "ProgressCounter{");
+      sb.append("counter=").append(counter.get());
+      sb.append('}');
+      return sb.toString();
+    }
+  }
+
+  /**
+   * Get the progress counter of a stage.
+   * @param stageConfig stage
+   * @return its progress counter.
+   */
+  ProgressCounter progressOf(StageConfig stageConfig) {
+    return (ProgressCounter) stageConfig.getProgressable();
+  }
+}

+ 152 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/CommitterTestBinding.java

@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskID;
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+import org.apache.hadoop.util.Progressable;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitter.TASK_COMMITTER;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.randomJobId;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createIOStatisticsStore;
+
+/**
+ * This class represents a binding to a job in the target dir, with the
+ * task attempt, job attempt and associated paths.
+ * It's self-contained so as to be usable in any test suite.
+ */
+class CommitterTestBinding implements
+    IOStatisticsSource {
+
+  /**
+   * IOStatistics counter for progress events.
+   */
+  public static final String PROGRESS_EVENTS = "progress_events";
+
+  /**
+   * IOStatistics to update with progress events.
+   */
+  private final IOStatisticsStore iostatistics;
+
+  /**
+   * Job attempt ID.
+   */
+  private final String jobAttemptId;
+
+  /**
+   * Job ID.
+   */
+  private final JobID jobId;
+
+  /**
+   * Task Attempt ID, under the job attempt.
+   */
+  private final TaskAttemptID taskAttemptId;
+
+  /**
+   * Task ID.
+   */
+  private final TaskID taskId;
+
+  /**
+   * Task attempt context for the given task Attempt.
+   */
+  private final TaskAttemptContext taskAttemptContext;
+
+  /**
+   * Construct.
+   * @param conf job/task config. This is patched with the app attempt.
+   * @param appAttempt application attempt.
+   * @param taskNumber task number
+   * @param taskAttemptNumber which attempt on this task is it
+   */
+  CommitterTestBinding(
+      Configuration conf,
+      int appAttempt, int taskNumber, int taskAttemptNumber) {
+    iostatistics = createIOStatisticsStore()
+        .withCounters(PROGRESS_EVENTS)
+        .build();
+
+
+    // this is the job ID, with no attempt info.
+    jobId = JobID.forName(randomJobId());
+    jobAttemptId = jobId.toString() + "_" + appAttempt;
+    taskId = new TaskID(jobId, TaskType.MAP, taskNumber);
+    taskAttemptId = new TaskAttemptID(taskId,
+        taskAttemptNumber);
+    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, appAttempt);
+    taskAttemptContext = new TaskAttemptContextImpl(conf, taskAttemptId);
+
+  }
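+
+  // Illustrative use: new CommitterTestBinding(conf, 1, 0, 0)
+  //   .createStageConfig(outputPath) yields a StageConfig whose
+  //   progress callback increments the PROGRESS_EVENTS counter.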
+
+  /**
+   * Create a committer config for the given output path.
+   * @param outputPath output path in destFS.
+   * @return a committer for the active task.
+   */
+  ManifestCommitterConfig createCommitterConfig(
+      Path outputPath) {
+    return new ManifestCommitterConfig(outputPath,
+        TASK_COMMITTER,
+        taskAttemptContext,
+        iostatistics,
+        null);
+  }
+
+  /**
+   * Create a stage config from the committer config.
+   * All stats go to the local IOStatisticsStore;
+   * there's a progress callback also set to increment
+   * the counter {@link #PROGRESS_EVENTS}
+   * @return a stage config
+   */
+  StageConfig createStageConfig(Path outputPath) {
+    return createCommitterConfig(outputPath)
+        .createStageConfig()
+        .withProgressable(new ProgressCallback());
+  }
+
+  @Override
+  public IOStatisticsStore getIOStatistics() {
+    return iostatistics;
+  }
+
+  /**
+   * Whenever this progress callback is invoked, the progress_events
+   * counter is incremented. This allows for tests to verify that
+   * callbacks have occurred by asserting on the event counter.
+   */
+  private final class ProgressCallback implements Progressable {
+
+    @Override
+    public void progress() {
+      iostatistics.incrementCounter(PROGRESS_EVENTS, 1);
+    }
+  }
+}

+ 420 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterTestSupport.java

@@ -0,0 +1,420 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.assertj.core.api.Assertions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskID;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestPrinter;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.util.functional.RemoteIterators;
+
+import static org.apache.commons.lang3.StringUtils.isNotEmpty;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_CLASSNAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Support for committer tests.
+ */
+public final class ManifestCommitterTestSupport {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      ManifestCommitterTestSupport.class);
+
+  private static final DateTimeFormatter FORMATTER =
+      DateTimeFormatter.ofPattern("yyyyMMddHHmmss");
+
+  /**
+   * Build directory property.
+   * Value: {@value}.
+   */
+  public static final String PROJECT_BUILD_DIRECTORY_PROPERTY
+      = "project.build.directory";
+
+  /**
+   * Default number of task attempts for some tests.
+   * Value: {@value}.
+   */
+  public static final int NUMBER_OF_TASK_ATTEMPTS = 200;
+
+  private ManifestCommitterTestSupport() {
+  }
+
+  /**
+   * Create a random Job ID using the fork ID as part of the number, if
+   * set in the current process.
+   * @return a job ID string in a format parseable as a job ID
+   */
+  public static String randomJobId() {
+    String testUniqueForkId = System.getProperty("test.unique.fork.id", "0001");
+    int l = testUniqueForkId.length();
+    String trailingDigits = testUniqueForkId.substring(l - 4, l);
+    int digitValue;
+    try {
+      digitValue = Integer.valueOf(trailingDigits);
+    } catch (NumberFormatException e) {
+      digitValue = 0;
+    }
+
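+    // e.g. (illustrative) "202401011230450042_0001":
+    // timestamp + random digits, then the fork ID digits.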
+    return String.format("%s%04d_%04d",
+        FORMATTER.format(LocalDateTime.now()),
+        (long) (Math.random() * 1000),
+        digitValue);
+  }
+
+  public static File getProjectBuildDir() {
+    String propval = System.getProperty(PROJECT_BUILD_DIRECTORY_PROPERTY);
+    if (StringUtils.isEmpty(propval)) {
+      propval = "target";
+    }
+    return new File(propval).getAbsoluteFile();
+  }
+
+  /**
+   * Load a success file; fail if the file is empty/nonexistent.
+   * @param fs filesystem
+   * @param outputPath directory containing the success file.
+   * @return the loaded file.
+   * @throws IOException failure to find/load the file
+   * @throws AssertionError if the file is 0 bytes long.
+   */
+  public static ManifestSuccessData loadSuccessFile(final FileSystem fs,
+      final Path outputPath) throws IOException {
+    Path success = new Path(outputPath, SUCCESS_MARKER);
+    return ManifestSuccessData.load(fs, success);
+  }
+
+  /**
+   * Load in the success data marker.
+   * @param fs filesystem
+   * @param outputDir output path of the job
+   * @param minimumFileCount minimum number of files to have been created
+   * @param jobId job ID, only verified if non-empty
+   * @return the success data
+   * @throws IOException IO failure
+   */
+  public static ManifestSuccessData validateSuccessFile(
+      final FileSystem fs,
+      final Path outputDir,
+      final int minimumFileCount,
+      final String jobId) throws IOException {
+    Path successPath = new Path(outputDir, SUCCESS_MARKER);
+    ManifestSuccessData successData
+        = loadAndPrintSuccessData(fs, successPath);
+    assertThat(successData.getCommitter())
+        .describedAs("Committer field")
+        .isEqualTo(MANIFEST_COMMITTER_CLASSNAME);
+    assertThat(successData.getFilenames())
+        .describedAs("Files committed")
+        .hasSizeGreaterThanOrEqualTo(minimumFileCount);
+    if (isNotEmpty(jobId)) {
+      assertThat(successData.getJobId())
+          .describedAs("JobID")
+          .isEqualTo(jobId);
+    }
+    return successData;
+  }
+
+  /**
+   * Load in and print a success data manifest.
+   * @param fs filesystem
+   * @param successPath full path to success file.
+   * @return the success data
+   * @throws IOException IO failure
+   */
+  public static ManifestSuccessData loadAndPrintSuccessData(
+      FileSystem fs,
+      Path successPath) throws IOException {
+    LOG.info("Manifest {}", successPath);
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    PrintStream ps = new PrintStream(baos);
+    final ManifestPrinter showManifest = new ManifestPrinter(fs.getConf(), ps);
+    ManifestSuccessData successData = showManifest.loadAndPrintManifest(fs, successPath);
+    LOG.info("{}", baos);
+    return successData;
+  }
+
+  /**
+   * Validate all generated files from the manifest.
+   * All files in the manifest must exist.
+   * If the exclusive flag is set, only those must exist
+   * (ignoring all temp files and everything in the _temporary
+   * dir)
+   * @param fs filesystem
+   * @param destDir dest dir to scan
+   * @param successData manifest
+   * @param exclusive expect exclusive and complete data.
+   * @return the files and their status
+   * @throws IOException IO failure.
+   */
+  public static Map<Path, LocatedFileStatus> validateGeneratedFiles(
+      FileSystem fs,
+      Path destDir,
+      ManifestSuccessData successData,
+      boolean exclusive) throws IOException {
+    Map<Path, LocatedFileStatus> map = new HashMap<>();
+    RemoteIterators.foreach(fs.listFiles(destDir, true),
+        e -> {
+          if (!e.getPath().getName().startsWith("_")) {
+            map.put(e.getPath(), e);
+          }
+        });
+    // map has all files other than temp ones and the success marker
+    // what do we expect
+    final List<Path> expected = filesInManifest(successData);
+
+    // all of those must be found
+    Assertions.assertThat(map.keySet())
+        .describedAs("Files in FS compared to manifest")
+        .containsAll(expected);
+
+    // and if exclusive, that too
+    if (exclusive) {
+      Assertions.assertThat(map.keySet())
+          .describedAs("Files in FS compared to manifest")
+          .containsExactlyInAnyOrderElementsOf(expected);
+    }
+    return map;
+  }
+
+  /**
+   * Given a manifest, get the list of filenames
+   * and convert to paths.
+   * @param successData data
+   * @return the paths.
+   */
+  public static List<Path> filesInManifest(ManifestSuccessData successData) {
+    return successData.getFilenames().stream()
+        .map(AbstractManifestData::unmarshallPath)
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * List a directory/directory tree.
+   * @param fileSystem FS
+   * @param path path
+   * @param recursive do a recursive listing?
+   * @return the number of files found.
+   * @throws IOException failure.
+   */
+  public static long lsR(FileSystem fileSystem, Path path, boolean recursive)
+      throws Exception {
+    if (path == null) {
+      // surfaces when someone calls getParent() on something at the top
+      // of the path
+      LOG.info("Empty path");
+      return 0;
+    }
+    return RemoteIterators.foreach(fileSystem.listFiles(path, recursive),
+        (status) -> LOG.info("{}", status));
+  }
+
+  /**
+   * Assert that a file or dir entry matches the given parameters.
+   * Matching on paths, not strings, helps validate marshalling.
+   * @param fileOrDir file or directory
+   * @param src source path
+   * @param dest dest path
+   * @param l length
+   */
+  static void assertFileEntryMatch(
+      final FileEntry fileOrDir,
+      final Path src,
+      final Path dest,
+      final long l) {
+    String entry = fileOrDir.toString();
+    assertThat(fileOrDir.getSourcePath())
+        .describedAs("Source path of " + entry)
+        .isEqualTo(src);
+    assertThat(fileOrDir.getDestPath())
+        .describedAs("Dest path of " + entry)
+        .isEqualTo(dest);
+    assertThat(fileOrDir.getSize())
+        .describedAs("Size of " + entry)
+        .isEqualTo(l);
+  }
+
+  /**
+   * Assert that a dir entry matches the given parameters.
+   * Matching on paths, not strings, helps validate marshalling.
+   * @param fileOrDir file or directory
+   * @param dest dest path
+   * @param type type
+   */
+  static void assertDirEntryMatch(
+      final DirEntry fileOrDir,
+      final Path dest,
+      final long type) {
+    String entry = fileOrDir.toString();
+    assertThat(fileOrDir.getDestPath())
+        .describedAs("Dest path of " + entry)
+        .isEqualTo(dest);
+    assertThat(fileOrDir.getType())
+        .describedAs("type of " + entry)
+        .isEqualTo(type);
+  }
+
+  /**
+   * Closeable which can be used to safely close writers in
+   * a try-with-resources block.
+   */
+  public static final class CloseWriter<K, V> implements AutoCloseable {
+
+    private final RecordWriter<K, V> writer;
+
+    private final TaskAttemptContext context;
+
+    public CloseWriter(RecordWriter<K, V> writer,
+        TaskAttemptContext context) {
+      this.writer = writer;
+      this.context = context;
+    }
+
+    @Override
+    public void close() {
+      try {
+        writer.close(context);
+      } catch (IOException | InterruptedException e) {
+        LOG.error("When closing {} on context {}",
+            writer, context, e);
+      }
+    }
+  }
+
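+  /**
+   * Task attempt ID format; expands to, e.g. (illustrative),
+   * {@code attempt_<jobId>_m_000003_1} for task 3, attempt 1.
+   */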
+  public static final String ATTEMPT_STRING =
+      "attempt_%s_m_%06d_%d";
+
+  /**
+   * Creates a random JobID and then the given number of tasks,
+   * each with the specified number of task attempts.
+   */
+  public static final class JobAndTaskIDsForTests {
+
+    /** Job ID; will be created uniquely for each instance. */
+    private final String jobId;
+
+    /**
+     * Store the details as strings; generate
+     * IDs on demand.
+     */
+    private final String[][] taskAttempts;
+
+    /**
+     * Constructor.
+     * @param tasks number of tasks.
+     * @param attempts number of attempts.
+     */
+    public JobAndTaskIDsForTests(int tasks, int attempts) {
+      this(randomJobId(), tasks, attempts);
+    }
+
+    public JobAndTaskIDsForTests(final String jobId,
+        int tasks, int attempts) {
+      this.jobId = jobId;
+      this.taskAttempts = new String[tasks][attempts];
+      for (int i = 0; i < tasks; i++) {
+        for (int j = 0; j < attempts; j++) {
+          String a = String.format(ATTEMPT_STRING,
+              jobId, i, j);
+          this.taskAttempts[i][j] = a;
+        }
+      }
+    }
+
+    /**
+     * Get the job ID.
+     * @return job ID string.
+     */
+    public String getJobId() {
+      return jobId;
+    }
+
+    /**
+     * Get the job ID as the MR type.
+     * @return job ID type.
+     */
+    public JobID getJobIdType() {
+      return getTaskIdType(0).getJobID();
+    }
+
+    /**
+     * Get a task attempt ID.
+     * @param task task index
+     * @param attempt attempt number.
+     * @return the task attempt.
+     */
+    public String getTaskAttempt(int task, int attempt) {
+      return taskAttempts[task][attempt];
+    }
+
+    /**
+     * Get task attempt ID as the MR type.
+     * @param task task index
+     * @param attempt attempt number.
+     * @return the task attempt type
+     */
+    public TaskAttemptID getTaskAttemptIdType(int task, int attempt) {
+      return TaskAttemptID.forName(getTaskAttempt(task, attempt));
+    }
+
+    /**
+     * Get task ID as the MR type.
+     * @param task task index
+     * @return the task ID type
+     */
+    public TaskID getTaskIdType(int task) {
+      return TaskAttemptID.forName(getTaskAttempt(task, 0)).getTaskID();
+    }
+
+    /**
+     * Get task ID as a string.
+     * @param task task index
+     * @return the task ID
+     */
+    public String getTaskId(int task) {
+      return getTaskIdType(task).toString();
+    }
+
+  }
+}

+ 109 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/StubStoreOperations.java

@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+import java.util.NoSuchElementException;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.util.JsonSerialization;
+
+/**
+ * Stub Store operations.
+ * Everything "works" provided you don't look too close.
+ * Files have etags of their filename; if you move a file without changing its
+ * name, the etag is preserved.
+ */
+public class StubStoreOperations extends ManifestStoreOperations {
+
+  @Override
+  public FileStatus getFileStatus(final Path path) throws IOException {
+    return new TaggedFileStatus(0, false, 1, 1024, 0, path, path.getName());
+  }
+
+  @Override
+  public boolean delete(final Path path, final boolean recursive)
+      throws IOException {
+    return true;
+  }
+
+  @Override
+  public boolean mkdirs(final Path path) throws IOException {
+    return true;
+  }
+
+  @Override
+  public boolean renameFile(final Path source, final Path dest)
+      throws IOException {
+    return true;
+  }
+
+  @Override
+  public RemoteIterator<FileStatus> listStatusIterator(final Path path)
+      throws IOException {
+    return new EmptyRemoteIterator<>();
+  }
+
+  @Override
+  public TaskManifest loadTaskManifest(JsonSerialization<TaskManifest> serializer,
+      final FileStatus st) throws IOException {
+    return new TaskManifest();
+  }
+
+  @Override
+  public <T extends AbstractManifestData<T>> void save(T manifestData,
+      final Path path,
+      final boolean overwrite) throws IOException {
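+    // no-op: the stub does not persist manifest data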
+
+  }
+
+  @Override
+  public void close() throws IOException {
+
+  }
+
+  @Override
+  public boolean storePreservesEtagsThroughRenames(final Path path) {
+    return true;
+  }
+
+  /**
+   * Always-empty remote iterator.
+   * @param <T> type of iterator.
+   */
+  private static final class EmptyRemoteIterator<T>
+      implements RemoteIterator<T> {
+
+    @Override
+    public boolean hasNext() throws IOException {
+      return false;
+    }
+
+    @Override
+    public T next() throws IOException {
+      throw new NoSuchElementException();
+    }
+  }
+
+}

+ 54 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TaggedFileStatus.java

@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.EtagSource;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * File Status with etag derived from the filename, if not explicitly set.
+ */
+public final class TaggedFileStatus extends FileStatus implements EtagSource {
+
+  private final String etag;
+
+  public TaggedFileStatus(final long length,
+      final boolean isdir,
+      final int blockReplication,
+      final long blocksize,
+      final long modificationTime,
+      final Path path,
+      final String etag) {
+    super(length, isdir, blockReplication, blocksize, modificationTime, path);
+    this.etag = etag;
+  }
+
+  public TaggedFileStatus(final FileStatus other, final String etag) throws IOException {
+    super(other);
+    this.etag = etag;
+  }
+
+  @Override
+  public String getEtag() {
+    return etag;
+  }
+}

+ 142 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCleanupStage.java

@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.util.List;
+
+import org.junit.Test;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.UnreliableManifestStoreOperations;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+
+
+/**
+ * Test the cleanup stage.
+ */
+public class TestCleanupStage extends AbstractManifestCommitterTest {
+
+  /**
+   * Number of task attempts to create. Manifests are created and written
+   * as well as test dirs, but no actual files.
+   */
+  protected static final int TASK_ATTEMPT_COUNT = 10;
+
+  /**
+   * Number of delete calls expected for the root job delete.
+   */
+  protected static final int ROOT_DELETE_COUNT = 1;
+
+  /**
+   * Total invocation count for a successful parallel delete job.
+   */
+  protected static final int PARALLEL_DELETE_COUNT =
+      TASK_ATTEMPT_COUNT + ROOT_DELETE_COUNT;
+
+  /**
+   * Fault Injection.
+   */
+  private UnreliableManifestStoreOperations failures;
+
+  /**
+   * Manifests created.
+   */
+  private List<TaskManifest> manifests;
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    failures = new UnreliableManifestStoreOperations(
+        createManifestStoreOperations());
+    setStoreOperations(failures);
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir);
+    setJobStageConfig(stageConfig);
+    new SetupJobStage(stageConfig).apply(true);
+
+    // lots of tasks, but don't bother creating mock files.
+    manifests = executeTaskAttempts(TASK_ATTEMPT_COUNT, 0);
+  }
+
+  @Test
+  public void testCleanupInParallelHealthy() throws Throwable {
+    describe("parallel cleanup of TA dirs.");
+    cleanup(true, true, false,
+        CleanupJobStage.Outcome.PARALLEL_DELETE,
+        PARALLEL_DELETE_COUNT);
+    verifyJobDirsCleanedUp();
+  }
+
+  @Test
+  public void testCleanupSingletonHealthy() throws Throwable {
+    describe("Cleanup with a single delete. Not the default; would be best on HDFS");
+
+    cleanup(true, false, false,
+        CleanupJobStage.Outcome.DELETED, ROOT_DELETE_COUNT);
+    verifyJobDirsCleanedUp();
+  }
+
+  @Test
+  public void testCleanupNoDir() throws Throwable {
+    describe("parallel cleanup MUST not fail if there's no dir");
+    // first do the cleanup
+    cleanup(true, true, false,
+        CleanupJobStage.Outcome.PARALLEL_DELETE, PARALLEL_DELETE_COUNT);
+
+    // now expect cleanup by a single delete to still work;
+    // the delete count is 0 as the pre-check skips it
+    cleanup(true, false, false,
+        CleanupJobStage.Outcome.NOTHING_TO_CLEAN_UP, 0);
+
+    // if skipped, that happens first
+    cleanup(false, true, false,
+        CleanupJobStage.Outcome.DISABLED, 0);
+  }
+
+  @Test
+  public void testFailureInParallelDelete() throws Throwable {
+    describe("A parallel delete fails, but the base delete works");
+
+    // pick one of the manifests
+    TaskManifest manifest = manifests.get(4);
+    Path taPath = new Path(manifest.getTaskAttemptDir());
+    failures.addDeletePathToFail(taPath);
+    cleanup(true, true, false,
+        CleanupJobStage.Outcome.DELETED, PARALLEL_DELETE_COUNT);
+  }
+
+  /**
+   * If there's no job task attempt subdir then listing it will raise
+   * an FNFE; this MUST be caught and the base delete executed.
+   */
+  @Test
+  public void testParallelDeleteNoTaskAttemptDir() throws Throwable {
+    describe("Execute parallel delete where" +
+        " the job task directory does not exist");
+    StageConfig stageConfig = getJobStageConfig();
+    // TA dir doesn't exist, so listing will fail.
+    failures.addPathNotFound(stageConfig.getJobAttemptTaskSubDir());
+    cleanup(true, true, false,
+        CleanupJobStage.Outcome.DELETED, ROOT_DELETE_COUNT);
+  }
+
+}

+ 126 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCommitTaskStage.java

@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.FileNotFoundException;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * Test committing a task.
+ */
+public class TestCommitTaskStage extends AbstractManifestCommitterTest {
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir);
+    setJobStageConfig(stageConfig);
+    new SetupJobStage(stageConfig).apply(true);
+  }
+
+  @Test
+  public void testCommitMissingDirectory() throws Throwable {
+
+    String tid = String.format("task_%03d", 1);
+    String taskAttemptId = String.format("%s_%02d",
+        tid, 1);
+    StageConfig taskStageConfig = createTaskStageConfig(JOB1, tid,
+        taskAttemptId);
+
+    // the task attempt dir does not exist
+    Path taDir = taskStageConfig.getTaskAttemptDir();
+    assertPathDoesNotExist("task attempt path", taDir);
+
+    // so the task commit fails
+    intercept(FileNotFoundException.class, () ->
+        new CommitTaskStage(taskStageConfig).apply(null));
+  }
+
+  @Test
+  public void testCommitEmptyDirectory() throws Throwable {
+
+    describe("Commit an empty directory as task then job");
+    String tid = String.format("task_%03d", 2);
+    String taskAttemptId = String.format("%s_%02d",
+        tid, 1);
+    StageConfig taskStageConfig = createTaskStageConfig(JOB1, tid,
+        taskAttemptId);
+
+    // set up the task
+    new SetupTaskStage(taskStageConfig).apply("setup");
+
+    CommitTaskStage.Result result = new CommitTaskStage(taskStageConfig)
+        .apply(null);
+
+    final TaskManifest manifest = result.getTaskManifest();
+    Assertions.assertThat(manifest.getDestDirectories())
+        .as("directories to create")
+        .isEmpty();
+    Assertions.assertThat(manifest.getFilesToCommit())
+        .as("files to commit")
+        .isEmpty();
+
+    final Path path = result.getPath();
+
+    final String manifestBody = readText(path);
+
+    LOG.info("manifest at {} of length {}:\n{}",
+        path, manifestBody.length(), manifestBody);
+
+    // now commit
+    final CommitJobStage.Result outcome = new CommitJobStage(getJobStageConfig())
+        .apply(new CommitJobStage.Arguments(
+            true, true, null,
+            new CleanupJobStage.Arguments(
+                OP_STAGE_JOB_CLEANUP,
+                true,
+                true,
+                false
+            )));
+
+    // review success file
+    final Path successPath = outcome.getSuccessPath();
+    String successBody = readText(successPath);
+    LOG.info("successBody at {} of length {}:\n{}",
+        successPath, successBody.length(), successBody);
+
+    final ManifestSuccessData successData = outcome.getJobSuccessData();
+    Assertions.assertThat(successData.getFilenames())
+        .as("Filenames in _SUCCESS")
+        .isEmpty();
+  }
+
+}

+ 307 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCreateOutputDirectoriesStage.java

@@ -0,0 +1,307 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.stream.Collectors;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.EntryStatus;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CreateOutputDirectoriesStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.util.Lists;
+
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.lookupCounterStatistic;
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_DELETE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_GET_FILE_STATUS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_IS_FILE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_MKDIRS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_FAILURES;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_DELETE_FILE_UNDER_DESTINATION;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_PREPARE_DIR_ANCESTORS;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * Test directory creation.
+ * As the directory creation phase relies on input from the task manifests to
+ * determine which directories to explicitly create, which files to delete,
+ * etc., these tests build up manifests and assert that the output
+ * of the directory creation stage matches that of the combination
+ * of the manifests and the filesystem state.
+ */
+public class TestCreateOutputDirectoriesStage extends AbstractManifestCommitterTest {
+
+  /**
+   * Deep tree width; subclasses (including those in external projects)
+   * may change it.
+   */
+  protected static final int DEEP_TREE_WIDTH = 4;
+
+  /**
+   * The number of directories created in test setup; this must be
+   * added to all assertions of the value of OP_MKDIRS.
+   */
+  private static final int DIRECTORIES_CREATED_IN_SETUP = 2;
+
+  private Path destDir;
+  private CreateOutputDirectoriesStage mkdirStage;
+  private StageConfig stageConfig;
+  private IOStatisticsStore iostats;
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    destDir = methodPath();
+    // clean up dest dir completely
+    destDir.getFileSystem(getConfiguration()).delete(destDir, true);
+    setStoreOperations(createManifestStoreOperations());
+    stageConfig = createStageConfigForJob(JOB1, destDir)
+        .withDeleteTargetPaths(true);
+    setJobStageConfig(stageConfig);
+    // creates the job directories.
+    new SetupJobStage(stageConfig).apply(true);
+    mkdirStage = new CreateOutputDirectoriesStage(stageConfig);
+    iostats = stageConfig.getIOStatistics();
+    // assert original count of dirs created == 2 : job and task manifest
+    verifyStatisticCounterValue(iostats, OP_MKDIRS,
+        DIRECTORIES_CREATED_IN_SETUP);
+    // reset the value to simplify future work
+    iostats.getCounterReference(OP_MKDIRS).set(0);
+  }
+
+  @Test
+  public void testPrepareSomeDirs() throws Throwable {
+
+    final long initialFileStatusCount = lookupCounterStatistic(iostats, OP_GET_FILE_STATUS);
+    final int dirCount = 8;
+    final List<Path> dirs = subpaths(destDir, dirCount);
+    final List<DirEntry> dirEntries = dirEntries(dirs, 1, EntryStatus.not_found);
+
+    // two manifests with duplicate entries
+    final List<TaskManifest> manifests = Lists.newArrayList(
+        manifestWithDirsToCreate(dirEntries),
+        manifestWithDirsToCreate(dirEntries));
+    final CreateOutputDirectoriesStage.Result result = mkdirStage.apply(manifests);
+    Assertions.assertThat(result.getCreatedDirectories())
+        .describedAs("output of %s", mkdirStage)
+        .containsExactlyInAnyOrderElementsOf(dirs);
+
+    LOG.info("Job Statistics\n{}", ioStatisticsToPrettyString(iostats));
+
+    // now dirCount new dirs are added.
+    verifyStatisticCounterValue(iostats, OP_MKDIRS, dirCount);
+
+    // now rerun the same preparation sequence, but this
+    // time declare that the directories exist (as they do)
+    final CreateOutputDirectoriesStage s2 =
+        new CreateOutputDirectoriesStage(stageConfig);
+    final CreateOutputDirectoriesStage.Result r2 = s2.apply(
+        Lists.newArrayList(
+            manifestWithDirsToCreate(dirEntries(dirs, 1, EntryStatus.dir))));
+
+    // no directories are now created.
+    Assertions.assertThat(r2.getCreatedDirectories())
+        .describedAs("output of %s", s2)
+        .isEmpty();
+    LOG.info("Job Statistics after second pass\n{}", ioStatisticsToPrettyString(iostats));
+
+    // second run probed no dest dirs
+    verifyStatisticCounterValue(iostats, OP_GET_FILE_STATUS, initialFileStatusCount);
+    // and no new mkdir calls were made
+    verifyStatisticCounterValue(iostats, OP_MKDIRS, dirCount);
+    verifyStatisticCounterValue(iostats, OP_DELETE_FILE_UNDER_DESTINATION, 0);
+    verifyStatisticCounterValue(iostats, OP_IS_FILE, 0);
+  }
+
+  /**
+   * Given a list of paths, build a list of DirEntry entries.
+   * @param paths list of paths
+   * @param level Level in the treewalk.
+   * @param entryStatus status of dirs
+   * @return list of entries with the given level and entry status.
+   */
+  protected List<DirEntry> dirEntries(Collection<Path> paths,
+      int level,
+      EntryStatus entryStatus) {
+    return paths.stream()
+        .map(p -> DirEntry.dirEntry(p, entryStatus, level))
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * Create a manifest with the list of directory entries added.
+   * Job commit requires the entries to have been probed for, and
+   * for the entire tree under the dest path to be included.
+   * @param dirEntries list of directory entries.
+   * @return the manifest.
+   */
+  protected TaskManifest manifestWithDirsToCreate(List<DirEntry> dirEntries) {
+    final TaskManifest taskManifest = new TaskManifest();
+    taskManifest.getDestDirectories().addAll(dirEntries);
+    return taskManifest;
+  }
+
+  /**
+   * Assert the directory map status of a path.
+   * @param result stage result
+   * @param path path to look up
+   * @param expected expected value.
+   */
+  private static void assertDirMapStatus(
+      CreateOutputDirectoriesStage.Result result,
+      Path path,
+      CreateOutputDirectoriesStage.DirMapState expected) {
+    Assertions.assertThat(result.getDirMap())
+        .describedAs("Directory Map entry for %s", path)
+        .isNotNull()
+        .containsKey(path)
+        .containsEntry(path, expected);
+  }
+
+  /**
+   * Prepare a deep tree {@code c ^ 3} of entries; with the default
+   * width of 4 this is 4 + 16 + 64 = 84 directory entries across the
+   * three levels.
+   * Make one of the parent entries a file, another an existing dir,
+   * and one of the leaf entries a file.
+   *
+   * From a test-purity perspective, these should
+   * be separate tests. But performing the
+   * operations in the same test case spreads the
+   * directory setup costs across them, rather than
+   * duplicating it.
+   */
+  @Test
+  public void testPrepareDirtyTree() throws Throwable {
+
+    // build the lists of paths for the different levels
+    final int c = getDeepTreeWidth();
+    final List<Path> level1 = subpaths(destDir, c);
+    final List<Path> level2 = level1.stream().flatMap(p ->
+            subpaths(p, c).stream())
+        .collect(Collectors.toList());
+    final List<Path> level3 = level2.stream().flatMap(p ->
+            subpaths(p, c).stream())
+        .collect(Collectors.toList());
+    // manifest dir entry list contains all levels > 0
+    // adding them out of order verifies sorting takes place
+    // before the merge routine which is intended to strip
+    // out parent dirs
+    final List<DirEntry> directories = new ArrayList<>();
+    final List<DirEntry> l1 = dirEntries(level1, 1, EntryStatus.not_found);
+    directories.addAll(l1);
+    final List<DirEntry> l3 = dirEntries(level3, 3, EntryStatus.not_found);
+    directories.addAll(l3);
+    final List<DirEntry> l2 = dirEntries(level2, 2, EntryStatus.not_found);
+    directories.addAll(l2);
+
+    // one of the level 1 paths is going to be a file
+    final DirEntry parentIsFile = l1.get(1);
+    // one entry has a dir already
+    final DirEntry parentIsDir = l2.get(0);
+    // and one of the dest dirs is a file.
+    final DirEntry leafIsFile = l3.get(0);
+
+    // prepare the output
+    CompletableFuture.allOf(
+        asyncPut(parentIsFile.getDestPath(), NO_DATA),
+        asyncPut(leafIsFile.getDestPath(), NO_DATA),
+        asyncMkdir(parentIsDir.getDestPath()))
+        .join();
+
+    // patch the entries; as they are references into the lists,
+    // this updates the values there.
+    parentIsFile.setStatus(EntryStatus.file);
+    parentIsDir.setStatus(EntryStatus.dir);
+    leafIsFile.setStatus(EntryStatus.file);
+
+    final List<TaskManifest> manifests = Lists.newArrayList(
+        manifestWithDirsToCreate(directories));
+
+    // first attempt will succeed.
+    final CreateOutputDirectoriesStage.Result result =
+        mkdirStage.apply(manifests);
+
+    LOG.info("Job Statistics\n{}", ioStatisticsToPrettyString(iostats));
+
+    assertDirMapStatus(result, leafIsFile.getDestPath(),
+        CreateOutputDirectoriesStage.DirMapState.fileNowDeleted);
+
+    // for the parent dir, all is good
+    assertDirMapStatus(result, parentIsFile.getDestPath(),
+        CreateOutputDirectoriesStage.DirMapState.fileNowDeleted);
+    Assertions.assertThat(result.getCreatedDirectories())
+        .describedAs("output of %s", mkdirStage)
+        .containsExactlyInAnyOrderElementsOf(level3);
+    verifyStatisticCounterValue(iostats, OP_MKDIRS, level3.size());
+    // do a rerun, where the directory setup will fail because
+    // a directory is present where the manifest says there is
+    // a file.
+    CreateOutputDirectoriesStage attempt2 =
+        new CreateOutputDirectoriesStage(
+            createStageConfigForJob(JOB1, destDir)
+                .withDeleteTargetPaths(true));
+    // attempt will fail because one of the entries marked as
+    // a file to delete is now a non-empty directory
+    LOG.info("Executing failing attempt to create the directories");
+    intercept(IOException.class, () -> attempt2.apply(manifests));
+    verifyStatisticCounterValue(iostats, OP_PREPARE_DIR_ANCESTORS + SUFFIX_FAILURES, 1);
+    verifyStatisticCounterValue(iostats, OP_DELETE + SUFFIX_FAILURES, 1);
+
+    // build a new directory list where everything is declared a directory;
+    // no dirs will be created this time as they all exist.
+    final List<DirEntry> directories3 = new ArrayList<>();
+    directories3.addAll(dirEntries(level1, 1, EntryStatus.dir));
+    directories3.addAll(dirEntries(level2, 2, EntryStatus.dir));
+    directories3.addAll(dirEntries(level3, 3, EntryStatus.dir));
+
+    final List<TaskManifest> manifests3 = Lists.newArrayList(
+        manifestWithDirsToCreate(directories3));
+    CreateOutputDirectoriesStage attempt3 =
+        new CreateOutputDirectoriesStage(
+            createStageConfigForJob(JOB1, destDir)
+                .withDeleteTargetPaths(true));
+    final CreateOutputDirectoriesStage.Result r3 =
+        attempt3.apply(manifests3);
+    assertDirMapStatus(r3, leafIsFile.getDestPath(),
+        CreateOutputDirectoriesStage.DirMapState.dirFoundInStore);
+    Assertions.assertThat(r3.getCreatedDirectories())
+        .describedAs("created directories")
+        .isEmpty();
+  }
+
+  /**
+   * Get the width of the deep tree; subclasses may tune this for test
+   * performance, though a wide tree is more representative of real jobs.
+   * @return number of subdirs to create at each level. Must be at least 2.
+   */
+  protected int getDeepTreeWidth() {
+    return DEEP_TREE_WIDTH;
+  }
+}

+ 601 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestJobThroughManifestCommitter.java

@@ -0,0 +1,601 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.assertj.core.api.Assertions;
+import org.junit.FixMethodOrder;
+import org.junit.Test;
+import org.junit.runners.MethodSorters;
+
+import org.apache.hadoop.fs.FileAlreadyExistsException;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.OutputValidationException;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.AbortTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.LoadManifestsStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.ValidateRenamedFilesStage;
+import org.apache.hadoop.net.NetUtils;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.rm;
+import static org.apache.hadoop.fs.contract.ContractTestUtils.verifyPathExists;
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticCounter;
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_ID_SOURCE_MAPREDUCE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_CLASSNAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_BYTES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_FILES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.loadAndPrintSuccessData;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.validateGeneratedFiles;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.PRINCIPAL;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.STAGE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestPathForTask;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage.DISABLED;
+import static org.apache.hadoop.security.UserGroupInformation.getCurrentUser;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * Test IO through the stages.
+ * This mimics the workflow of a job with two tasks:
+ * the first task has two attempts, with the second attempt
+ * committed after the first (simulating the
+ * failure-during-task-commit scenario which the v2 algorithm
+ * cannot handle).
+ * A sketch of the stage sequence is included at the top of the
+ * class body.
+ *
+ * The tests are ordered and the output dir is not cleaned up
+ * after each test case.
+ * The last test case MUST perform the cleanup.
+ */
+@FixMethodOrder(MethodSorters.NAME_ASCENDING)
+public class TestJobThroughManifestCommitter
+    extends AbstractManifestCommitterTest {
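+
+  /*
+   * Illustrative sketch only (not executed): assuming the stage classes
+   * behave as exercised by the ordered test cases below, a minimal
+   * single-task run through the protocol looks roughly like this:
+   *
+   *   StageConfig job = createStageConfigForJob(JOB1, destDir).build();
+   *   new SetupJobStage(job).apply(true);               // job attempt dir
+   *   StageConfig ta = createStageConfig(JOB1, TASK0, TA0, destDir).build();
+   *   new SetupTaskStage(ta).apply("first");            // task attempt dir
+   *   // ... the task writes its output under the task attempt dir ...
+   *   new CommitTaskStage(ta).apply(null);              // saves the task manifest
+   *   new CommitJobStage(job).apply(
+   *       new CommitJobStage.Arguments(true, false, null, DISABLED));
+   *   new CleanupJobStage(job).apply(new CleanupJobStage.Arguments(
+   *       OP_STAGE_JOB_CLEANUP, true, true, false));
+   *
+   * The test cases below split this sequence up, add a second task and
+   * duplicate task attempts, and verify the state of the store after
+   * each stage.
+   */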
+
+  /** Destination directory. */
+  private Path destDir;
+
+  /** directory names for the tests. */
+  private ManifestCommitterSupport.AttemptDirectories dirs;
+
+  /**
+   * To ensure that the local FS has a shared root path, this is static.
+   */
+  @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized")
+  private static Path sharedTestRoot = null;
+
+  /**
+   * Job ID.
+   */
+  private String jobId;
+
+  /**
+   * Task 0 attempt 0 ID.
+   */
+  private String taskAttempt00;
+
+  /**
+   * Task 0 attempt 1 ID.
+   */
+  private String taskAttempt01;
+
+  /**
+   * Task 1 attempt 0 ID.
+   */
+  private String taskAttempt10;
+
+  /**
+   * Task 1 attempt 1 ID.
+   */
+  private String taskAttempt11;
+
+  /**
+   * Stage config for TA00.
+   */
+  private StageConfig ta00Config;
+
+  /**
+   * Stage config for TA01.
+   */
+  private StageConfig ta01Config;
+
+  /**
+   * Stage config for TA10.
+   */
+  private StageConfig ta10Config;
+
+  /**
+   * Stage config for TA11.
+   */
+  private StageConfig ta11Config;
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    taskAttempt00 = TASK_IDS.getTaskAttempt(TASK0, TA0);
+    taskAttempt01 = TASK_IDS.getTaskAttempt(TASK0, TA1);
+    taskAttempt10 = TASK_IDS.getTaskAttempt(TASK1, TA0);
+    taskAttempt11 = TASK_IDS.getTaskAttempt(TASK1, TA1);
+    setSharedPath(path("TestJobThroughManifestCommitter"));
+    // add a dir with a space in.
+    destDir = new Path(sharedTestRoot, "out put");
+    jobId = TASK_IDS.getJobId();
+    // then the specific path underneath that for the attempt.
+    dirs = new ManifestCommitterSupport.AttemptDirectories(destDir,
+        jobId, 1);
+
+    // config for job attempt 1, task 00
+    setJobStageConfig(createStageConfigForJob(JOB1, destDir).build());
+    ta00Config = createStageConfig(JOB1, TASK0, TA0, destDir).build();
+    ta01Config = createStageConfig(JOB1, TASK0, TA1, destDir).build();
+    ta10Config = createStageConfig(JOB1, TASK1, TA0, destDir).build();
+    ta11Config = createStageConfig(JOB1, TASK1, TA1, destDir).build();
+  }
+
+  /**
+   * Test dir deletion is disabled in test case teardown so that
+   * subsequent tests can see the output of earlier ones.
+   * @throws IOException failure
+   */
+  @Override
+  protected void deleteTestDirInTeardown() throws IOException {
+    /* no-op */
+  }
+
+  /**
+   * Override point: something to turn on/off when exploring what
+   * manifests look like. Test runs against stores where storage is
+   * billed MUST enable this.
+   * @return true if, at the end of the run, the test dir should be deleted.
+   */
+  protected boolean shouldDeleteTestRootAtEndOfTestRun() {
+    return false;
+  }
+
+  /**
+   * Invoke this to clean up the test directories.
+   */
+  private void deleteSharedTestRoot() throws IOException {
+    describe("Deleting shared test root %s", sharedTestRoot);
+
+    rm(getFileSystem(), sharedTestRoot, true, false);
+  }
+
+  /**
+   * Set the shared test root if not already set.
+   * @param path path to set.
+   * @return true if the path was set
+   */
+  private static synchronized boolean setSharedPath(final Path path) {
+    if (sharedTestRoot == null) {
+      // set this as needed
+      LOG.info("Set shared path to {}", path);
+      sharedTestRoot = path;
+      return true;
+    }
+    return false;
+  }
+
+  @Test
+  public void test_0000_setupTestDir() throws Throwable {
+    describe("always ensure directory setup is empty");
+    deleteSharedTestRoot();
+  }
+
+  @Test
+  public void test_0100_setupJobStage() throws Throwable {
+    describe("Set up a job");
+    verifyPath("Job attempt dir",
+        dirs.getJobAttemptDir(),
+        new SetupJobStage(getJobStageConfig()).apply(true));
+  }
+
+  /**
+   * Check that the job setup stage worked.
+   * @throws IOException failure.
+   */
+  private void verifyJobSetupCompleted() throws IOException {
+    assertPathExists("Job attempt dir from test_0100", dirs.getJobAttemptDir());
+  }
+
+  @Test
+  public void test_0110_setupJobOnlyAllowedOnce() throws Throwable {
+    describe("a second creation of a job attempt must fail");
+    verifyJobSetupCompleted();
+    intercept(FileAlreadyExistsException.class, "", () ->
+        new SetupJobStage(getJobStageConfig()).apply(true));
+    // job is still there
+    assertPathExists("Job attempt dir", dirs.getJobAttemptDir());
+  }
+
+  @Test
+  public void test_0120_setupJobNewAttemptNumber() throws Throwable {
+    describe("Creating a new job attempt is supported");
+    verifyJobSetupCompleted();
+    Path path = pathMustExist("Job attempt 2 dir",
+        new SetupJobStage(createStageConfig(2, -1, 0, destDir))
+            .apply(false));
+    Assertions.assertThat(path)
+        .describedAs("Stage created path")
+        .isNotEqualTo(dirs.getJobAttemptDir());
+  }
+
+  @Test
+  public void test_0200_setupTask00() throws Throwable {
+    describe("Set up a task; job must have been set up first");
+    verifyJobSetupCompleted();
+    verifyPath("Task attempt 00",
+        dirs.getTaskAttemptPath(taskAttempt00),
+        new SetupTaskStage(ta00Config).apply("first"));
+  }
+
+  /**
+   * Verify TA00 is set up.
+   */
+  private void verifyTaskAttempt00SetUp() throws IOException {
+    pathMustExist("Dir from taskAttempt00 setup",
+        dirs.getTaskAttemptPath(taskAttempt00));
+  }
+
+  @Test
+  public void test_0210_setupTask00OnlyAllowedOnce() throws Throwable {
+    describe("Second attempt to set up task00 must fail.");
+    verifyTaskAttempt00SetUp();
+    intercept(FileAlreadyExistsException.class, "second", () ->
+        new SetupTaskStage(ta00Config).apply("second"));
+  }
+
+  @Test
+  public void test_0220_setupTask01() throws Throwable {
+    describe("Setup task attempt 01");
+    verifyTaskAttempt00SetUp();
+    verifyPath("Task attempt 01",
+        dirs.getTaskAttemptPath(taskAttempt01),
+        new SetupTaskStage(ta01Config)
+            .apply("01"));
+  }
+
+  @Test
+  public void test_0230_setupTask10() throws Throwable {
+    describe("Setup task attempt 10");
+    verifyJobSetupCompleted();
+    verifyPath("Task attempt 10",
+        dirs.getTaskAttemptPath(taskAttempt10),
+        new SetupTaskStage(ta10Config)
+            .apply("10"));
+  }
+
+  /**
+   * Setup then abort task 11 before creating any files;
+   * verify that commit fails before creating a manifest file.
+   */
+  @Test
+  public void test_0240_setupThenAbortTask11() throws Throwable {
+    describe("Setup then abort task attempt 11");
+    verifyJobSetupCompleted();
+    Path ta11Path = new SetupTaskStage(ta11Config).apply("11");
+    Path deletedDir = new AbortTaskStage(ta11Config).apply(false);
+    Assertions.assertThat(ta11Path)
+        .isEqualTo(deletedDir);
+    assertPathDoesNotExist("aborted directory", ta11Path);
+    // execute will fail as there's no dir to list.
+    intercept(FileNotFoundException.class, () ->
+        new CommitTaskStage(ta11Config).apply(null));
+    assertPathDoesNotExist("task manifest",
+        manifestPathForTask(dirs.getTaskManifestDir(),
+            TASK_IDS.getTaskId(TASK1)));
+  }
+
+  /**
+   * Execute TA00 by generating a lot of files in its directory,
+   * then committing the task attempt.
+   * The manifest at the task path (i.e. the record of which attempt's
+   * output is to be used) MUST now have been generated by this TA.
+   */
+  @Test
+  public void test_0300_executeTask00() throws Throwable {
+    describe("Create the files for Task 00, then commit the task");
+    List<Path> files = createFilesOrDirs(dirs.getTaskAttemptPath(taskAttempt00),
+        "part-00", getExecutorService(),
+        DEPTH, WIDTH, FILES_PER_DIRECTORY, false);
+    // saves the task manifest to the job dir
+    CommitTaskStage.Result result = new CommitTaskStage(ta00Config)
+        .apply(null);
+    verifyPathExists(getFileSystem(), "manifest",
+        result.getPath());
+
+    TaskManifest manifest = result.getTaskManifest();
+    manifest.validate();
+    // clear the IOStats to reduce the size of the printed JSON.
+    manifest.setIOStatistics(null);
+    LOG.info("Task Manifest {}", manifest.toJson());
+    validateTaskAttemptManifest(this.taskAttempt00, files, manifest);
+  }
+
+  /**
+   * Validate the manifest of a task attempt.
+   * @param attemptId attempt ID
+   * @param files files which were created.
+   * @param manifest manifest
+   * @throws IOException IO problem
+   */
+  protected void validateTaskAttemptManifest(
+      String attemptId,
+      List<Path> files,
+      TaskManifest manifest) throws IOException {
+
+    verifyManifestTaskAttemptID(manifest, attemptId);
+
+    // validate the manifest
+    verifyManifestFilesMatch(manifest, files);
+  }
+
+  /**
+   * Execute TA01 by generating a lot of files in its directory
+   * then committing the task attempt.
+   * The manifest at the task path (i.e. the record of which attempt's
+   * output is to be used) MUST now have been generated by this TA.
+   * Any existing manifest will have been overwritten.
+   */
+  @Test
+  public void test_0310_executeTask01() throws Throwable {
+    describe("Create the files for Task 01, then commit the task");
+    List<Path> files = createFilesOrDirs(dirs.getTaskAttemptPath(taskAttempt01),
+        "part-00", getExecutorService(),
+        DEPTH, WIDTH, FILES_PER_DIRECTORY, false);
+    // saves the task manifest to the job dir
+    CommitTaskStage.Result result = new CommitTaskStage(ta01Config)
+        .apply(null);
+    Path manifestPath = verifyPathExists(getFileSystem(), "manifest",
+        result.getPath()).getPath();
+
+    // load the manifest from the FS, not the return value,
+    // so we can verify that last task to commit wins.
+    TaskManifest manifest = TaskManifest.load(getFileSystem(), manifestPath);
+    manifest.validate();
+    // clear the IOStats to reduce the size of the printed JSON.
+    manifest.setIOStatistics(null);
+    LOG.info("Task Manifest {}", manifest.toJson());
+
+    validateTaskAttemptManifest(taskAttempt01, files, manifest);
+
+  }
+
+  /**
+   * Second task writes to more directories, but fewer files per dir.
+   * This ensures that there will be dirs here which aren't in the
+   * first task's output.
+   */
+  @Test
+  public void test_0320_executeTask10() throws Throwable {
+    describe("Create the files for Task 10, then commit the task");
+    List<Path> files = createFilesOrDirs(
+        dirs.getTaskAttemptPath(ta10Config.getTaskAttemptId()),
+        "part-01", getExecutorService(),
+        DEPTH, WIDTH + 1, FILES_PER_DIRECTORY - 1, false);
+    // saves the task manifest to the job dir
+    CommitTaskStage.Result result = new CommitTaskStage(ta10Config)
+        .apply(null);
+    TaskManifest manifest = result.getTaskManifest();
+    validateTaskAttemptManifest(taskAttempt10, files, manifest);
+  }
+
+  @Test
+  public void test_0340_setupThenAbortTask11() throws Throwable {
+    describe("Setup then abort task attempt 11");
+    Path ta11Path = new SetupTaskStage(ta11Config).apply("11");
+    createFilesOrDirs(
+        ta11Path,
+        "part-01", getExecutorService(),
+        2, 1, 1, false);
+
+    new AbortTaskStage(ta11Config).apply(false);
+    assertPathDoesNotExist("aborted directory", ta11Path);
+    // execute will fail as there's no dir to list.
+    intercept(FileNotFoundException.class, () ->
+        new CommitTaskStage(ta11Config).apply(null));
+
+    // and the manifest MUST be unchanged from the previous stage
+    Path manifestPathForTask1 = manifestPathForTask(dirs.getTaskManifestDir(),
+        TASK_IDS.getTaskId(TASK1));
+    verifyManifestTaskAttemptID(
+        TaskManifest.load(getFileSystem(), manifestPathForTask1),
+        taskAttempt10);
+
+  }
+
+  /**
+   * Load all the committed manifests, which must be TA01 (last of
+   * task 0 to commit) and TA10.
+   */
+  @Test
+  public void test_0400_loadManifests() throws Throwable {
+    describe("Load all manifests; committed must be TA01 and TA10");
+    LoadManifestsStage.Result result
+        = new LoadManifestsStage(getJobStageConfig()).apply(true);
+    String summary = result.getSummary().toString();
+    LOG.info("Manifest summary {}", summary);
+    List<TaskManifest> manifests = result.getManifests();
+    Assertions.assertThat(manifests)
+        .describedAs("Loaded manifests in %s", summary)
+        .hasSize(2);
+    Map<String, TaskManifest> manifestMap = toMap(manifests);
+    verifyManifestTaskAttemptID(
+        manifestMap.get(taskAttempt01), taskAttempt01);
+    verifyManifestTaskAttemptID(
+        manifestMap.get(taskAttempt10), taskAttempt10);
+  }
+
+  @Test
+  public void test_0410_commitJob() throws Throwable {
+    describe("Commit the job");
+    CommitJobStage stage = new CommitJobStage(getJobStageConfig());
+    stage.apply(new CommitJobStage.Arguments(true, false, null, DISABLED));
+  }
+
+  /**
+   * Validate that the job output is good by invoking the
+   * {@link ValidateRenamedFilesStage} stage to
+   * validate all the manifests.
+   */
+  @Test
+  public void test_0420_validateJob() throws Throwable {
+    describe("Validate the output of the job through the validation"
+        + " stage");
+
+    // load in the success data.
+    ManifestSuccessData successData = loadAndPrintSuccessData(
+        getFileSystem(),
+        getJobStageConfig().getJobSuccessMarkerPath());
+
+    // load manifests stage will load all the task manifests again
+    List<TaskManifest> manifests = new LoadManifestsStage(getJobStageConfig())
+        .apply(true).getManifests();
+    // Now verify their files exist, returning the list of renamed files.
+    List<String> committedFiles = new ValidateRenamedFilesStage(getJobStageConfig())
+        .apply(manifests)
+        .stream().map(FileEntry::getDest)
+        .collect(Collectors.toList());
+
+    // verify that the list of committed files also matches
+    // that in the _SUCCESS file.
+    // note: there's a limit on the number of files listed in the
+    // _SUCCESS file, to stop writing it from slowing down jobs;
+    // therefore we don't make a simple "all must match" check.
+    Assertions.assertThat(committedFiles)
+        .containsAll(successData.getFilenames());
+
+    // now patch one of the manifest files by editing an entry
+    FileEntry entry = manifests.get(0).getFilesToCommit().get(0);
+    // no longer exists.
+    String oldName = entry.getDest();
+    String newName = oldName + ".missing";
+    entry.setDest(newName);
+
+    // validation will now fail
+    intercept(OutputValidationException.class, ".missing", () ->
+        new ValidateRenamedFilesStage(getJobStageConfig())
+            .apply(manifests));
+
+    // restore the name, but change the size
+    entry.setDest(oldName);
+    entry.setSize(128_000_000);
+    intercept(OutputValidationException.class, () ->
+        new ValidateRenamedFilesStage(getJobStageConfig())
+            .apply(manifests));
+  }
+
+  @Test
+  public void test_0430_validateStatistics() throws Throwable {
+    // load in the success data.
+    ManifestSuccessData successData = ManifestSuccessData.load(
+        getFileSystem(),
+        getJobStageConfig().getJobSuccessMarkerPath());
+    String json = successData.toJson();
+    LOG.info("Success data is {}", json);
+    Assertions.assertThat(successData)
+        .describedAs("Manifest " + json)
+        .returns(NetUtils.getLocalHostname(),
+            ManifestSuccessData::getHostname)
+        .returns(MANIFEST_COMMITTER_CLASSNAME,
+            ManifestSuccessData::getCommitter)
+        .returns(jobId,
+            ManifestSuccessData::getJobId)
+        .returns(true,
+            ManifestSuccessData::getSuccess)
+        .returns(JOB_ID_SOURCE_MAPREDUCE,
+            ManifestSuccessData::getJobIdSource);
+    // diagnostics
+    Assertions.assertThat(successData.getDiagnostics())
+        .containsEntry(PRINCIPAL,
+            getCurrentUser().getShortUserName())
+        .containsEntry(STAGE, OP_STAGE_JOB_COMMIT);
+
+    // and stats
+    IOStatisticsSnapshot iostats = successData.getIOStatistics();
+
+    int files = successData.getFilenames().size();
+    verifyStatisticCounterValue(iostats,
+        OP_STAGE_JOB_COMMIT, 1);
+    assertThatStatisticCounter(iostats,
+        COMMITTER_FILES_COMMITTED_COUNT)
+        .isGreaterThanOrEqualTo(files);
+    Long totalFiles = iostats.counters().get(COMMITTER_FILES_COMMITTED_COUNT);
+    verifyStatisticCounterValue(iostats,
+        COMMITTER_BYTES_COMMITTED_COUNT, totalFiles * 2);
+  }
+
+  @Test
+  public void test_440_validateSuccessFiles() throws Throwable {
+
+    // load in the success data.
+    final FileSystem fs = getFileSystem();
+    ManifestSuccessData successData = loadAndPrintSuccessData(
+        fs,
+        getJobStageConfig().getJobSuccessMarkerPath());
+    validateGeneratedFiles(fs,
+        getJobStageConfig().getDestinationDir(),
+        successData, false);
+  }
+
+  @Test
+  public void test_0900_cleanupJob() throws Throwable {
+    describe("Cleanup job");
+    CleanupJobStage.Arguments arguments = new CleanupJobStage.Arguments(
+        OP_STAGE_JOB_CLEANUP, true, true, false);
+    // the first run will list the three task attempt dirs and delete each
+    // one before the toplevel dir.
+    CleanupJobStage.Result result = new CleanupJobStage(
+        getJobStageConfig()).apply(arguments);
+    assertCleanupResult(result, CleanupJobStage.Outcome.PARALLEL_DELETE, 1 + 3);
+    assertPathDoesNotExist("Job attempt dir", result.getDirectory());
+
+    // not an error if we retry and the dir isn't there
+    result = new CleanupJobStage(getJobStageConfig()).apply(arguments);
+    assertCleanupResult(result, CleanupJobStage.Outcome.NOTHING_TO_CLEAN_UP, 0);
+  }
+
+  /**
+   * Needed to clean up the shared test root, as test case teardown
+   * does not do it.
+   */
+  //@Test
+  public void test_9999_cleanupTestDir() throws Throwable {
+    if (shouldDeleteTestRootAtEndOfTestRun()) {
+      deleteSharedTestRoot();
+    }
+  }
+
+}

+ 141 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestLoadManifestsStage.java

@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CreateOutputDirectoriesStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.LoadManifestsStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+
+/**
+ * Test loading manifests from a store.
+ * By not creating files we can simulate a large job just by
+ * creating the manifests.
+ * The SaveTaskManifestStage is used for the save operation;
+ * this does a save + rename.
+ * For better test performance against a remote store, a thread
+ * pool is used to save the manifests in parallel.
+ */
+public class TestLoadManifestsStage extends AbstractManifestCommitterTest {
+
+  private int taskAttemptCount;
+
+  /**
+   * How many task attempts to make?
+   * Override point.
+   * @return a number greater than 0.
+   */
+  protected int numberOfTaskAttempts() {
+    return ManifestCommitterTestSupport.NUMBER_OF_TASK_ATTEMPTS;
+  }
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    taskAttemptCount = numberOfTaskAttempts();
+    Assertions.assertThat(taskAttemptCount)
+        .describedAs("Task attempt count")
+        .isGreaterThan(0);
+  }
+
+  /**
+   * Build a large number of manifests, but without the real files
+   * and directories.
+   * Save the manifests under the job attempt dir, then load
+   * them via the {@link LoadManifestsStage}.
+   * The directory preparation process is then executed after this.
+   * Because we know each task attempt creates the same number of directories,
+   * they will all be merged and so only a limited number of output dirs
+   * will be created.
+   */
+  @Test
+  public void testSaveThenLoadManyManifests() throws Throwable {
+
+    describe("Creating many manifests with fake file/dir entries,"
+        + " load them and prepare the output dirs.");
+
+    int filesPerTaskAttempt = 10;
+    LOG.info("Number of task attempts: {}, files per task attempt {}",
+        taskAttemptCount, filesPerTaskAttempt);
+
+    setJobStageConfig(createStageConfigForJob(JOB1, getDestDir()));
+
+    // set up the job.
+    new SetupJobStage(getJobStageConfig()).apply(false);
+
+    LOG.info("Creating manifest files for {}", taskAttemptCount);
+
+    executeTaskAttempts(taskAttemptCount, filesPerTaskAttempt);
+
+    LOG.info("Loading in the manifests");
+
+    // Load in the manifests
+    LoadManifestsStage stage = new LoadManifestsStage(
+        getJobStageConfig());
+
+    LoadManifestsStage.Result result = stage.apply(true);
+    LoadManifestsStage.SummaryInfo summary = result.getSummary();
+    List<TaskManifest> loadedManifests = result.getManifests();
+
+    Assertions.assertThat(summary.getManifestCount())
+        .describedAs("Manifest count of  %s", summary)
+        .isEqualTo(taskAttemptCount);
+    Assertions.assertThat(summary.getFileCount())
+        .describedAs("File count of  %s", summary)
+        .isEqualTo(taskAttemptCount * (long) filesPerTaskAttempt);
+    Assertions.assertThat(summary.getTotalFileSize())
+        .describedAs("File Size of  %s", summary)
+        .isEqualTo(getTotalDataSize());
+
+    // now verify that the manifest task IDs match the expected task IDs.
+    List<String> manifestTaskIds = loadedManifests.stream()
+        .map(TaskManifest::getTaskID)
+        .collect(Collectors.toList());
+    Assertions.assertThat(getTaskIds())
+        .describedAs("Task IDs of all tasks")
+        .containsExactlyInAnyOrderElementsOf(manifestTaskIds);
+
+    // now let's see about aggregating a large set of directories
+    Set<Path> createdDirectories = new CreateOutputDirectoriesStage(
+        getJobStageConfig())
+        .apply(loadedManifests)
+        .getCreatedDirectories();
+
+    // but after the merge process, only one directory per generated
+    // file output dir exists, regardless of the number of task attempts
+    Assertions.assertThat(createdDirectories)
+        .describedAs("Directories created")
+        .hasSize(filesPerTaskAttempt);
+
+    // and skipping the rename stage (which is going to fail),
+    // go straight to cleanup
+    new CleanupJobStage(getJobStageConfig()).apply(
+        new CleanupJobStage.Arguments("", true, true, false));
+  }
+
+}

+ 1801 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestManifestCommitProtocol.java

@@ -0,0 +1,1801 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+
+import org.assertj.core.api.Assertions;
+import org.junit.AfterClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.contract.ContractTestUtils;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.fs.statistics.IOStatisticsSupport;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobStatus;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.task.JobContextImpl;
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+import org.apache.hadoop.util.DurationInfo;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.concurrent.HadoopExecutors;
+import org.apache.hadoop.util.functional.RemoteIterators;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.fs.contract.ContractTestUtils.listChildren;
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsSourceToString;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory.COMMITTER_FACTORY_CLASS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_FACTORY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SPARK_WRITE_UUID;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_BYTES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_FILES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASKS_COMPLETED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_LOAD_MANIFEST;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_ABORT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createJobSummaryFilename;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.randomJobId;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.validateSuccessFile;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.STAGE;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * This is a contract test for the commit protocol on a target filesystem.
+ * It is subclassed in the ABFS integration tests and elsewhere; a sketch
+ * of how such a subclass binds in is included at the top of the class body.
+ * Derived from the S3A protocol suite, which was itself based on
+ * the test suite {@code TestFileOutputCommitter}.
+ *
+ * Some of the methods trigger Java warnings about unchecked casts;
+ * these cannot be avoided, so the warnings are suppressed.
+ */
+@SuppressWarnings("unchecked")
+public class TestManifestCommitProtocol
+    extends AbstractManifestCommitterTest {
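+
+  /*
+   * Illustrative sketch only; the subclass name here is hypothetical and
+   * not part of this patch. A store-specific integration suite would
+   * typically extend this class and override the naming/creation hooks
+   * defined below, e.g.
+   *
+   *   public class ITestStoreManifestCommitProtocol
+   *       extends TestManifestCommitProtocol {
+   *     @Override
+   *     protected String suitename() {
+   *       return "ITestStoreManifestCommitProtocol";
+   *     }
+   *   }
+   *
+   * Paths stay isolated through getMethodName(), and the protocol test
+   * cases then run unchanged against the target filesystem.
+   */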
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(TestManifestCommitProtocol.class);
+
+  private static final String SUB_DIR = "SUB_DIR";
+
+  /**
+   * Part of the name of the output of task attempt 0.
+   */
+  protected static final String PART_00000 = "part-m-00000";
+
+  private static final Text KEY_1 = new Text("key1");
+
+  private static final Text KEY_2 = new Text("key2");
+
+  private static final Text VAL_1 = new Text("val1");
+
+  private static final Text VAL_2 = new Text("val2");
+
+  /**
+   * Snapshot of stats, which will be collected from
+   * committers.
+   */
+  private static final IOStatisticsSnapshot IOSTATISTICS =
+      IOStatisticsSupport.snapshotIOStatistics();
+
+  /**
+   * Job ID for jobs.
+   */
+  private final String jobId;
+
+  /**
+   * A random task attempt id for testing.
+   */
+  private final String attempt0;
+
+  /**
+   *  Attempt 0's task attempt ID.
+   */
+  private final TaskAttemptID taskAttempt0;
+
+  /**
+   * TA 1.
+   */
+  private final TaskAttemptID taskAttempt1;
+
+  /**
+   * Attempt 1 string value.
+   */
+  private final String attempt1;
+
+
+  /** Jobs to abort in test case teardown. */
+  private final List<JobData> abortInTeardown = new ArrayList<>(1);
+
+  /**
+   * Output directory.
+   * This is the directory into which output goes;
+   * all the job files go in _temporary underneath.
+   */
+  private Path outputDir;
+
+  /**
+   * Committer factory which calls back into
+   * {@link #createCommitter(Path, TaskAttemptContext)}.
+   */
+  private final LocalCommitterFactory
+      localCommitterFactory = new LocalCommitterFactory();
+
+  /**
+   * Clean up the output dir. No-op if
+   * {@link #outputDir} is null.
+   * @throws IOException failure to delete
+   */
+  private void cleanupOutputDir() throws IOException {
+    if (outputDir != null) {
+      getFileSystem().delete(outputDir, true);
+    }
+  }
+
+  /**
+   * Constructor.
+   */
+  public TestManifestCommitProtocol() {
+    ManifestCommitterTestSupport.JobAndTaskIDsForTests taskIDs
+        = new ManifestCommitterTestSupport.JobAndTaskIDsForTests(2, 2);
+    jobId = taskIDs.getJobId();
+    attempt0 = taskIDs.getTaskAttempt(0, 0);
+    taskAttempt0 = taskIDs.getTaskAttemptIdType(0, 0);
+    attempt1 = taskIDs.getTaskAttempt(0, 1);
+    taskAttempt1 = taskIDs.getTaskAttemptIdType(0, 1);
+  }
+
+  /**
+   * This must return the name of a suite which is unique to the test.
+   * @return a string which must be unique and a valid path.
+   */
+  protected String suitename() {
+    return "TestManifestCommitProtocolLocalFS";
+  }
+
+  /**
+   * Get the log; can be overridden for test case log.
+   * @return a log.
+   */
+  public Logger log() {
+    return LOG;
+  }
+
+  /**
+   * Overridden method returns the suitename as well as the method name,
+   * so if more than one committer test is run in parallel, paths are
+   * isolated.
+   * @return a name for a method, unique across the suites and test cases.
+   */
+  @Override
+  protected String getMethodName() {
+    return suitename() + "-" + super.getMethodName();
+  }
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+
+    outputDir = path(getMethodName());
+    cleanupOutputDir();
+  }
+
+  @Override
+  public void teardown() throws Exception {
+    describe("teardown");
+    Thread.currentThread().setName("teardown");
+    for (JobData jobData : abortInTeardown) {
+      // stop the job
+      abortJobQuietly(jobData);
+      // and then get its statistics
+      IOSTATISTICS.aggregate(jobData.committer.getIOStatistics());
+    }
+    try {
+      cleanupOutputDir();
+    } catch (IOException e) {
+      log().info("Exception during cleanup", e);
+    }
+    super.teardown();
+  }
+
+  @AfterClass
+  public static void logAggregateIOStatistics() {
+    LOG.info("Final IOStatistics {}",
+        ioStatisticsToPrettyString(IOSTATISTICS));
+  }
+
+  /**
+   * Add the specified job to the current list of jobs to abort in teardown.
+   * @param jobData job data.
+   */
+  protected void abortInTeardown(JobData jobData) {
+    abortInTeardown.add(jobData);
+  }
+
+  @Override
+  protected Configuration createConfiguration() {
+    Configuration conf = super.createConfiguration();
+    bindCommitter(conf);
+    return conf;
+  }
+
+  /**
+   * Set job up to use the manifest committer.
+   * @param conf configuration to set up
+   */
+  protected void bindCommitter(Configuration conf) {
+    conf.set(COMMITTER_FACTORY_CLASS, MANIFEST_COMMITTER_FACTORY);
+  }
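+
+  // Note: this is the same binding a real job would apply before
+  // submission, i.e. setting COMMITTER_FACTORY_CLASS to
+  // MANIFEST_COMMITTER_FACTORY on the job configuration; the
+  // enableManifestCommitter() helper used in newJob() below is assumed
+  // to do the equivalent.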
+
+  /**
+   * Create a committer for a task.
+   * @param context task context
+   * @return new committer
+   * @throws IOException failure
+   */
+  protected ManifestCommitter createCommitter(
+      TaskAttemptContext context) throws IOException {
+    return createCommitter(getOutputDir(), context);
+  }
+
+  /**
+   * Create a committer for a task and a given output path.
+   * @param outputPath path
+   * @param context task context
+   * @return new committer
+   * @throws IOException failure
+   */
+  protected ManifestCommitter createCommitter(
+      Path outputPath,
+      TaskAttemptContext context) throws IOException {
+    return new ManifestCommitter(outputPath, context);
+  }
+
+  protected Path getOutputDir() {
+    return outputDir;
+  }
+
+  protected String getJobId() {
+    return jobId;
+  }
+
+  protected String getAttempt0() {
+    return attempt0;
+  }
+
+  protected TaskAttemptID getTaskAttempt0() {
+    return taskAttempt0;
+  }
+
+  protected String getAttempt1() {
+    return attempt1;
+  }
+
+  protected TaskAttemptID getTaskAttempt1() {
+    return taskAttempt1;
+  }
+
+  /**
+   * Functional interface for creating committers, designed to allow
+   * different factories to be used to create different failure modes;
+   * see the illustrative sketch after this interface.
+   */
+  @FunctionalInterface
+  public interface CommitterFactory {
+
+    /**
+     * Create a committer for a task.
+     * @param context task context
+     * @return new committer
+     * @throws IOException failure
+     */
+    ManifestCommitter createCommitter(
+        TaskAttemptContext context) throws IOException;
+  }
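+
+  /*
+   * Illustrative sketch only: because CommitterFactory is a functional
+   * interface, a failure-injecting factory can be passed straight to
+   * startJob(); FaultyCommitterForTests is hypothetical and not part of
+   * this patch:
+   *
+   *   JobData jobData = startJob(
+   *       context -> new FaultyCommitterForTests(getOutputDir(), context),
+   *       true);
+   *
+   * This suite itself only uses the LocalCommitterFactory below.
+   */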
+
+  /**
+   * The normal committer creation factory, uses the abstract methods
+   * in the class.
+   */
+  protected class LocalCommitterFactory implements CommitterFactory {
+
+    @Override
+    public ManifestCommitter createCommitter(TaskAttemptContext context)
+        throws IOException {
+      return TestManifestCommitProtocol.this
+          .createCommitter(context);
+    }
+  }
+
+  /**
+   * Assert that for a given output, the job context returns a manifest
+   * committer factory. This is what FileOutputFormat does internally,
+   * and is needed to make sure that the relevant settings are being passed
+   * around.
+   * @param context job/task context
+   * @param output destination path.
+   */
+  protected void assertCommitterFactoryIsManifestCommitter(
+      JobContext context, Path output) {
+
+    final Configuration conf = context.getConfiguration();
+    // check one: committer
+    assertConfigurationUsesManifestCommitter(conf);
+    final String factoryName = conf.get(COMMITTER_FACTORY_CLASS, "");
+    final PathOutputCommitterFactory factory
+        = PathOutputCommitterFactory.getCommitterFactory(
+        output,
+        conf);
+    Assertions.assertThat(factory)
+        .describedAs("Committer for output path %s"
+                + " and factory name \"%s\"",
+            output, factoryName)
+        .isInstanceOf(ManifestCommitterFactory.class);
+  }
+
+  /**
+   * This is to debug situations where the test committer factory
+   * on tasks was binding to FileOutputCommitter even when
+   * tests were overriding it.
+   * @param conf configuration to probe.
+   */
+  private void assertConfigurationUsesManifestCommitter(
+      Configuration conf) {
+    final String factoryName = conf.get(COMMITTER_FACTORY_CLASS, null);
+    Assertions.assertThat(factoryName)
+        .describedAs("Value of %s", COMMITTER_FACTORY_CLASS)
+        .isEqualTo(MANIFEST_COMMITTER_FACTORY);
+  }
+
+  /**
+   * Write some text out.
+   * @param context task
+   * @throws IOException IO failure
+   * @throws InterruptedException write interrupted
+   * @return the path written to
+   */
+  protected Path writeTextOutput(TaskAttemptContext context)
+      throws IOException, InterruptedException {
+    describe("write output");
+    try (DurationInfo d = new DurationInfo(LOG,
+        "Writing Text output for task %s", context.getTaskAttemptID())) {
+      TextOutputForTests.LoggingLineRecordWriter<Writable, Object> writer
+          = new TextOutputForTests<Writable, Object>().getRecordWriter(context);
+      writeOutput(writer, context);
+      return writer.getDest();
+    }
+  }
+
+  /**
+   * Write the standard output.
+   * @param writer record writer
+   * @param context task context
+   * @throws IOException IO failure
+   * @throws InterruptedException write interrupted
+   */
+  private void writeOutput(
+      RecordWriter<Writable, Object> writer,
+      TaskAttemptContext context) throws IOException, InterruptedException {
+    NullWritable nullWritable = NullWritable.get();
+    try (ManifestCommitterTestSupport.CloseWriter<Writable, Object> cw =
+             new ManifestCommitterTestSupport.CloseWriter<>(writer, context)) {
+      writer.write(KEY_1, VAL_1);
+      writer.write(null, nullWritable);
+      writer.write(null, VAL_1);
+      writer.write(nullWritable, VAL_2);
+      writer.write(KEY_2, nullWritable);
+      writer.write(KEY_1, null);
+      writer.write(null, null);
+      writer.write(KEY_2, VAL_2);
+      writer.close(context);
+    }
+  }
+
+  /**
+   * Write the output of a map.
+   * @param writer record writer
+   * @param context task context
+   * @throws IOException IO failure
+   * @throws InterruptedException write interrupted
+   */
+  private void writeMapFileOutput(RecordWriter<WritableComparable<?>, Writable> writer,
+      TaskAttemptContext context) throws IOException, InterruptedException {
+    describe("\nWrite map output");
+    try (DurationInfo d = new DurationInfo(LOG,
+        "Writing Text output for task %s", context.getTaskAttemptID());
+         ManifestCommitterTestSupport.CloseWriter<WritableComparable<?>, Writable> cw =
+             new ManifestCommitterTestSupport.CloseWriter<>(writer, context)) {
+      for (int i = 0; i < 10; ++i) {
+        Text val = ((i & 1) == 1) ? VAL_1 : VAL_2;
+        writer.write(new LongWritable(i), val);
+      }
+      LOG.debug("Closing writer {}", writer);
+      writer.close(context);
+    }
+  }
+
+  /**
+   * Details on a job for use in {@code startJob} and elsewhere.
+   */
+  protected static final class JobData {
+
+    private final Job job;
+
+    private final JobContext jContext;
+
+    private final TaskAttemptContext tContext;
+
+    private final ManifestCommitter committer;
+
+    private final Configuration conf;
+
+    private Path writtenTextPath; // null if not written to
+
+    public JobData(Job job,
+        JobContext jContext,
+        TaskAttemptContext tContext,
+        ManifestCommitter committer) {
+      this.job = job;
+      this.jContext = jContext;
+      this.tContext = tContext;
+      this.committer = committer;
+      conf = job.getConfiguration();
+    }
+
+    public String jobId() {
+      return committer.getJobUniqueId();
+    }
+  }
+
+  /**
+   * Create a new job. Sets the task attempt ID,
+   * and output dir; asks for a success marker.
+   * @return the new job
+   * @throws IOException failure
+   */
+  public Job newJob() throws IOException {
+    return newJob(outputDir, getConfiguration(), attempt0);
+  }
+
+  /**
+   * Create a new job. Sets the task attempt ID,
+   * and output dir; asks for a success marker.
+   * Committer factory is set to manifest factory, so is independent
+   * of FS schema.
+   * @param dir dest dir
+   * @param configuration config to get the job from
+   * @param taskAttemptId task attempt
+   * @return the new job
+   * @throws IOException failure
+   */
+  private Job newJob(Path dir, Configuration configuration,
+      String taskAttemptId) throws IOException {
+    Job job = Job.getInstance(configuration);
+    Configuration conf = job.getConfiguration();
+    conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskAttemptId);
+    enableManifestCommitter(conf);
+    FileOutputFormat.setOutputPath(job, dir);
+    return job;
+  }
+
+  /**
+   * Start a job with a committer; optionally write the test data.
+   * Always register the job to be aborted (quietly) in teardown.
+   * This is, from an "OO-purity" perspective, the wrong kind of method
+   * to have: it sets things up, mixes functionality and registers for
+   * teardown. Its aim is simple though: a common body of code for
+   * starting work in test cases.
+   * @param writeText should the text be written?
+   * @return the job data 4-tuple
+   * @throws IOException IO problems
+   * @throws InterruptedException interruption during write
+   */
+  protected JobData startJob(boolean writeText)
+      throws IOException, InterruptedException {
+    return startJob(localCommitterFactory, writeText);
+  }
+
+  /**
+   * Start a job with a committer; optionally write the test data.
+   * Always register the job to be aborted (quietly) in teardown.
+   * This is, from an "OO-purity" perspective, the wrong kind of method
+   * to have: it sets things up, mixes functionality and registers for
+   * teardown. Its aim is simple though: a common body of code for
+   * starting work in test cases.
+   * @param factory the committer factory to use
+   * @param writeText should the text be written?
+   * @return the job data 4-tuple
+   * @throws IOException IO problems
+   * @throws InterruptedException interruption during write
+   */
+  protected JobData startJob(CommitterFactory factory, boolean writeText)
+      throws IOException, InterruptedException {
+    Job job = newJob();
+    Configuration conf = job.getConfiguration();
+    assertConfigurationUsesManifestCommitter(conf);
+    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt0);
+    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 1);
+    JobContext jContext = new JobContextImpl(conf, taskAttempt0.getJobID());
+    TaskAttemptContext tContext = new TaskAttemptContextImpl(conf,
+        taskAttempt0);
+    ManifestCommitter committer = factory.createCommitter(tContext);
+
+    // setup
+    JobData jobData = new JobData(job, jContext, tContext, committer);
+    setupJob(jobData);
+    abortInTeardown(jobData);
+
+    if (writeText) {
+      // write output
+      jobData.writtenTextPath = writeTextOutput(tContext);
+    }
+    return jobData;
+  }
+
+  /**
+   * Set up the job and task.
+   * @param jobData job data
+   * @throws IOException problems
+   */
+  protected void setupJob(JobData jobData) throws IOException {
+    ManifestCommitter committer = jobData.committer;
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    describe("\nsetup job");
+    try (DurationInfo d = new DurationInfo(LOG,
+        "setup job %s", jContext.getJobID())) {
+      committer.setupJob(jContext);
+    }
+    setupCommitter(committer, tContext);
+    describe("setup complete");
+  }
+
+  private void setupCommitter(
+      final ManifestCommitter committer,
+      final TaskAttemptContext tContext) throws IOException {
+    try (DurationInfo d = new DurationInfo(LOG,
+        "setup task %s", tContext.getTaskAttemptID())) {
+      committer.setupTask(tContext);
+    }
+  }
+
+  /**
+   * Abort a job quietly.
+   * @param jobData job info
+   */
+  protected void abortJobQuietly(JobData jobData) {
+    abortJobQuietly(jobData.committer, jobData.jContext, jobData.tContext);
+  }
+
+  /**
+   * Abort a job quietly: first task, then job.
+   * @param committer committer
+   * @param jContext job context
+   * @param tContext task context
+   */
+  protected void abortJobQuietly(ManifestCommitter committer,
+      JobContext jContext,
+      TaskAttemptContext tContext) {
+    describe("\naborting task");
+    try {
+      committer.abortTask(tContext);
+    } catch (Exception e) {
+      log().warn("Exception aborting task:", e);
+    }
+    describe("\naborting job");
+    try {
+      committer.abortJob(jContext, JobStatus.State.KILLED);
+    } catch (Exception e) {
+      log().warn("Exception aborting job", e);
+    }
+  }
+
+  /**
+   * Commit the task and then the job.
+   * @param committer committer
+   * @param jContext job context
+   * @param tContext task context
+   * @throws IOException problems
+   */
+  protected void commitTaskAndJob(ManifestCommitter committer,
+      JobContext jContext,
+      TaskAttemptContext tContext) throws IOException {
+    try (DurationInfo d = new DurationInfo(LOG,
+        "committing Job %s", jContext.getJobID())) {
+      describe("\ncommitting task");
+      committer.commitTask(tContext);
+      describe("\ncommitting job");
+      committer.commitJob(jContext);
+      describe("commit complete\n");
+    }
+  }
+
+  /**
+   * Execute work as part of a test, after creating the job.
+   * After the execution, {@link #abortJobQuietly(JobData)} is
+   * called for abort/cleanup.
+   * @param name name of work (for logging)
+   * @param action action to execute
+   * @throws Exception failure
+   */
+  protected void executeWork(String name, ActionToTest action)
+      throws Exception {
+    executeWork(name, startJob(false), action);
+  }
+
+  /**
+   * Execute work as part of a test, against the created job.
+   * After the execution, {@link #abortJobQuietly(JobData)} is
+   * called for abort/cleanup.
+   * @param name name of work (for logging)
+   * @param jobData job info
+   * @param action action to execute
+   * @throws Exception failure
+   */
+  public void executeWork(String name,
+      JobData jobData,
+      ActionToTest action) throws Exception {
+    try (DurationInfo d = new DurationInfo(LOG, "Executing %s", name)) {
+      action.exec(jobData.job,
+          jobData.jContext,
+          jobData.tContext,
+          jobData.committer);
+    } finally {
+      abortJobQuietly(jobData);
+    }
+  }
+
+  /**
+   * Load a manifest from the test FS.
+   * @param path path
+   * @return the manifest
+   * @throws IOException failure to load
+   */
+  TaskManifest loadManifest(Path path) throws IOException {
+    return TaskManifest.load(getFileSystem(), path);
+  }
+
+  /**
+   * Verify that recovery doesn't work for these committers.
+   */
+  @Test
+  @SuppressWarnings("deprecation")
+  public void testRecoveryAndCleanup() throws Exception {
+    describe("Test (unsupported) task recovery.");
+    JobData jobData = startJob(true);
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+
+    Assertions.assertThat(committer.getWorkPath())
+        .as("null workPath in committer " + committer)
+        .isNotNull();
+    Assertions.assertThat(committer.getOutputPath())
+        .as("null outputPath in committer " + committer)
+        .isNotNull();
+
+    // Commit the task.
+    commitTask(committer, tContext);
+
+    // load and log the manifest
+    final TaskManifest manifest = loadManifest(
+        committer.getTaskManifestPath(tContext));
+    LOG.info("Manifest {}", manifest);
+
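+    // simulate a second application attempt trying to recover the task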
+    Configuration conf2 = jobData.job.getConfiguration();
+    conf2.set(MRJobConfig.TASK_ATTEMPT_ID, attempt0);
+    conf2.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 2);
+    JobContext jContext2 = new JobContextImpl(conf2, taskAttempt0.getJobID());
+    TaskAttemptContext tContext2 = new TaskAttemptContextImpl(conf2,
+        taskAttempt0);
+    ManifestCommitter committer2 = createCommitter(tContext2);
+    committer2.setupJob(tContext2);
+
+    Assertions.assertThat(committer2.isRecoverySupported())
+        .as("recoverySupported in " + committer2)
+        .isFalse();
+    intercept(IOException.class, "recover",
+        () -> committer2.recoverTask(tContext2));
+
+    // at this point, task attempt 0 has failed to recover
+    // it should be abortable though. This will be a no-op as it already
+    // committed
+    describe("aborting task attempt 2; expect nothing to clean up");
+    committer2.abortTask(tContext2);
+    describe("Aborting job 2; expect pending commits to be aborted");
+    committer2.abortJob(jContext2, JobStatus.State.KILLED);
+  }
+
+  /**
+   * Assert that the task attempt filesystem doesn't have a task attempt
+   * directory.
+   * @param committer committer
+   * @param context task context
+   * @throws IOException IO failure.
+   */
+  protected void assertTaskAttemptPathDoesNotExist(
+      ManifestCommitter committer, TaskAttemptContext context)
+      throws IOException {
+    Path attemptPath = committer.getTaskAttemptPath(context);
+    ContractTestUtils.assertPathDoesNotExist(
+        attemptPath.getFileSystem(context.getConfiguration()),
+        "task attempt dir",
+        attemptPath);
+  }
+
+  protected void assertJobAttemptPathDoesNotExist(
+      ManifestCommitter committer, JobContext context)
+      throws IOException {
+    Path attemptPath = committer.getJobAttemptPath(context);
+    ContractTestUtils.assertPathDoesNotExist(
+        attemptPath.getFileSystem(context.getConfiguration()),
+        "job attempt dir",
+        attemptPath);
+  }
+
+  /**
+   * Verify the output of the directory.
+   * That includes the {@code part-m-00000-*}
+   * file existence and contents, as well as optionally, the success marker.
+   * @param dir directory to scan.
+   * @param expectSuccessMarker check the success marker?
+   * @param expectedJobId job ID, verified if non-empty and success data loaded
+   * @throws Exception failure.
+   * @return the success data
+   */
+  private ManifestSuccessData validateContent(Path dir,
+      boolean expectSuccessMarker,
+      String expectedJobId) throws Exception {
+    lsR(getFileSystem(), dir, true);
+    ManifestSuccessData successData;
+    if (expectSuccessMarker) {
+      successData = verifySuccessMarker(dir, expectedJobId);
+    } else {
+      successData = null;
+    }
+    Path expectedFile = getPart0000(dir);
+    log().debug("Validating content in {}", expectedFile);
+    StringBuilder expectedOutput = new StringBuilder();
+    expectedOutput.append(KEY_1).append('\t').append(VAL_1).append("\n");
+    expectedOutput.append(VAL_1).append("\n");
+    expectedOutput.append(VAL_2).append("\n");
+    expectedOutput.append(KEY_2).append("\n");
+    expectedOutput.append(KEY_1).append("\n");
+    expectedOutput.append(KEY_2).append('\t').append(VAL_2).append("\n");
+    String output = readFile(expectedFile);
+    Assertions.assertThat(output)
+        .describedAs("Content of %s", expectedFile)
+        .isEqualTo(expectedOutput.toString());
+    return successData;
+  }
+
+  /**
+   * Identify any path under the directory which begins with the
+   * {@code "part-m-00000"} sequence. There's some compensation for
+   * eventual consistency here.
+   * @param dir directory to scan
+   * @return the full path
+   * @throws FileNotFoundException the path is missing.
+   * @throws Exception failure.
+   */
+  protected Path getPart0000(final Path dir) throws Exception {
+    final FileSystem fs = dir.getFileSystem(getConfiguration());
+    FileStatus[] statuses = fs.listStatus(dir,
+        path -> path.getName().startsWith(PART_00000));
+    if (statuses.length != 1) {
+      // fail, with a listing of the parent dir
+      ContractTestUtils.assertPathExists(fs, "Output file",
+          new Path(dir, PART_00000));
+    }
+    return statuses[0].getPath();
+  }
+
+  /**
+   * Look for the part file subdirectory of the output dir
+   * and its index and data entries.
+   * @param fs filesystem
+   * @param dir output dir
+   * @throws Exception failure.
+   */
+  private void validateMapFileOutputContent(
+      FileSystem fs, Path dir) throws Exception {
+    // map output is a directory with index and data files
+    assertPathExists("Map output", dir);
+    Path expectedMapDir = getPart0000(dir);
+    assertPathExists("Map output", expectedMapDir);
+    assertIsDirectory(expectedMapDir);
+    FileStatus[] files = fs.listStatus(expectedMapDir);
+    Assertions.assertThat(files)
+        .as("No files found in " + expectedMapDir)
+        .isNotEmpty();
+    assertPathExists("index file in " + expectedMapDir,
+        new Path(expectedMapDir, MapFile.INDEX_FILE_NAME));
+    assertPathExists("data file in " + expectedMapDir,
+        new Path(expectedMapDir, MapFile.DATA_FILE_NAME));
+  }
+
+  /**
+   * Full test of the expected lifecycle: start job, task, write, commit task,
+   * commit job.
+   * @throws Exception on a failure
+   */
+  @Test
+  public void testCommitLifecycle() throws Exception {
+    describe("Full test of the expected lifecycle:\n" +
+        " start job, task, write, commit task, commit job.\n" +
+        "Verify:\n" +
+        "* no files are visible after task commit\n" +
+        "* the expected file is visible after job commit\n");
+    JobData jobData = startJob(false);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+    assertCommitterFactoryIsManifestCommitter(tContext,
+        tContext.getWorkingDirectory());
+    validateTaskAttemptWorkingDirectory(committer, tContext);
+
+    // write output
+    describe("1. Writing output");
+    final Path textOutputPath = writeTextOutput(tContext);
+    describe("Output written to %s", textOutputPath);
+
+    describe("2. Committing task");
+    Assertions.assertThat(committer.needsTaskCommit(tContext))
+        .as("No files to commit were found by " + committer)
+        .isTrue();
+    commitTask(committer, tContext);
+    final TaskManifest taskManifest = requireNonNull(
+        committer.getTaskAttemptCommittedManifest(), "committerTaskManifest");
+    final String manifestJSON = taskManifest.toJson();
+    LOG.info("Task manifest {}", manifestJSON);
+    int filesCreated = 1;
+    Assertions.assertThat(taskManifest.getFilesToCommit())
+        .describedAs("Files to commit in task manifest %s", manifestJSON)
+        .hasSize(filesCreated);
+    Assertions.assertThat(taskManifest.getDestDirectories())
+        .describedAs("Directories to create in task manifest %s",
+            manifestJSON)
+        .isEmpty();
+
+    // this is only task commit; there MUST be no part- files in the dest dir
+    try {
+      RemoteIterators.foreach(getFileSystem().listFiles(outputDir, false),
+          (status) ->
+              Assertions.assertThat(status.getPath().toString())
+                  .as("task committed file to dest :" + status)
+                  .contains("part"));
+    } catch (FileNotFoundException ignored) {
+      log().info("Outdir {} is not created by task commit phase ",
+          outputDir);
+    }
+
+    describe("3. Committing job");
+
+    commitJob(committer, jContext);
+
+    // validate output
+    describe("4. Validating content");
+    String jobUniqueId = jobData.jobId();
+    ManifestSuccessData successData = validateContent(outputDir,
+        true,
+        jobUniqueId);
+    // look in the SUMMARY
+    Assertions.assertThat(successData.getDiagnostics())
+        .describedAs("Stage entry in SUCCESS")
+        .containsEntry(STAGE, OP_STAGE_JOB_COMMIT);
+    IOStatisticsSnapshot jobStats = successData.getIOStatistics();
+    // manifest
+    verifyStatisticCounterValue(jobStats,
+        OP_LOAD_MANIFEST, 1);
+    FileStatus st = getFileSystem().getFileStatus(getPart0000(outputDir));
+    verifyStatisticCounterValue(jobStats,
+        COMMITTER_FILES_COMMITTED_COUNT, filesCreated);
+    verifyStatisticCounterValue(jobStats,
+        COMMITTER_BYTES_COMMITTED_COUNT, st.getLen());
+
+    // now load and examine the job report.
+    // this MUST contain all the stats of the summary, plus timings on
+    // job commit itself
+
+    ManifestSuccessData report = loadReport(jobUniqueId, true);
+    Map<String, String> diag = report.getDiagnostics();
+    Assertions.assertThat(diag)
+        .describedAs("Stage entry in report")
+        .containsEntry(STAGE, OP_STAGE_JOB_COMMIT);
+    IOStatisticsSnapshot reportStats = report.getIOStatistics();
+    verifyStatisticCounterValue(reportStats,
+        OP_LOAD_MANIFEST, 1);
+    verifyStatisticCounterValue(reportStats,
+        OP_STAGE_JOB_COMMIT, 1);
+    verifyStatisticCounterValue(reportStats,
+        COMMITTER_FILES_COMMITTED_COUNT, filesCreated);
+    verifyStatisticCounterValue(reportStats,
+        COMMITTER_BYTES_COMMITTED_COUNT, st.getLen());
+
+  }
+
+  /**
+   * Load a summary from the report dir.
+   * @param jobUniqueId job ID
+   * @param expectSuccess is the job expected to have succeeded.
+   * @throws IOException failure to load
+   * @return the report
+   */
+  private ManifestSuccessData loadReport(String jobUniqueId,
+      boolean expectSuccess) throws IOException {
+    File file = new File(getReportDir(),
+        createJobSummaryFilename(jobUniqueId));
+    ContractTestUtils.assertIsFile(FileSystem.getLocal(getConfiguration()),
+        new Path(file.toURI()));
+    ManifestSuccessData report = ManifestSuccessData.serializer().load(file);
+    LOG.info("Report for job {}:\n{}", jobUniqueId, report.toJson());
+    Assertions.assertThat(report.getSuccess())
+        .describedAs("success flag in report")
+        .isEqualTo(expectSuccess);
+    return report;
+  }
+
+  /**
+   * Repeated commit call after job commit.
+   */
+  @Test
+  public void testCommitterWithDuplicatedCommit() throws Exception {
+    describe("Call a task then job commit twice;" +
+        "expect the second task commit to fail.");
+    JobData jobData = startJob(true);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+
+    // do commit
+    describe("committing task");
+    committer.commitTask(tContext);
+
+    // repeated commit while TA dir exists fine/idempotent
+    committer.commitTask(tContext);
+
+    describe("committing job");
+    committer.commitJob(jContext);
+    describe("commit complete\n");
+
+    describe("cleanup");
+    committer.cleanupJob(jContext);
+    // validate output
+    validateContent(outputDir, shouldExpectSuccessMarker(),
+        committer.getJobUniqueId());
+
+    // the retried task commit is expected to fail as the task attempt dir no longer exists
+    describe("Attempting commit of the same task after job commit -expecting failure");
+    expectFNFEonTaskCommit(committer, tContext);
+  }
+
+  /**
+   * HADOOP-17258. If a second task attempt is committed, it
+   * must succeed, and the output of the first TA, even if already
+   * committed, MUST NOT be visible in the final output.
+   * <p></p>
+   * What's important is not just that only one TA must succeed,
+   * but it must be the last one executed.
+   */
+  @Test
+  public void testTwoTaskAttemptsCommit() throws Exception {
+    describe("Commit two task attempts;" +
+        " expect the second attempt to succeed.");
+    JobData jobData = startJob(false);
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+    // do commit
+    describe("\ncommitting task");
+    // write output for TA 1,
+    Path outputTA1 = writeTextOutput(tContext);
+
+    // speculatively execute committer 2.
+
+    // jobconf with a different base to its parts.
+    Configuration conf2 = jobData.conf;
+    conf2.set("mapreduce.output.basename", "attempt2");
+    String attempt2 = "attempt_" + jobId + "_m_000000_1";
+    TaskAttemptID ta2 = TaskAttemptID.forName(attempt2);
+    TaskAttemptContext tContext2 = new TaskAttemptContextImpl(
+        conf2, ta2);
+
+    ManifestCommitter committer2 = localCommitterFactory
+        .createCommitter(tContext2);
+    setupCommitter(committer2, tContext2);
+
+    // verify working dirs are different
+    Assertions.assertThat(committer.getWorkPath())
+        .describedAs("Working dir of %s", committer)
+        .isNotEqualTo(committer2.getWorkPath());
+
+    // write output for TA 2,
+    Path outputTA2 = writeTextOutput(tContext2);
+
+    // verify the names are different.
+    String name1 = outputTA1.getName();
+    String name2 = outputTA2.getName();
+    Assertions.assertThat(name1)
+        .describedAs("name of task attempt output %s", outputTA1)
+        .isNotEqualTo(name2);
+
+    // commit task 1
+    committer.commitTask(tContext);
+
+    // then pretend that task1 didn't respond, so
+    // commit task 2
+    committer2.commitTask(tContext2);
+
+    // and the job
+    committer2.commitJob(tContext);
+
+    // validate output
+    FileSystem fs = getFileSystem();
+    ManifestSuccessData successData = validateSuccessFile(fs, outputDir,
+        1,
+        "");
+    Assertions.assertThat(successData.getFilenames())
+        .describedAs("Files committed")
+        .hasSize(1);
+
+    assertPathExists("attempt2 output", new Path(outputDir, name2));
+    assertPathDoesNotExist("attempt1 output", new Path(outputDir, name1));
+
+  }
+
+  protected boolean shouldExpectSuccessMarker() {
+    return true;
+  }
+
+  /**
+   * Simulate a failure on the first job commit; expect the
+   * second to succeed.
+   */
+  /*@Test
+  public void testCommitterWithFailure() throws Exception {
+    describe("Fail the first job commit then retry");
+    JobData jobData = startJob(new FailingCommitterFactory(), true);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+
+    // do commit
+    committer.commitTask(tContext);
+
+    // now fail job
+    expectSimulatedFailureOnJobCommit(jContext, committer);
+
+    commitJob(committer, jContext);
+
+    // but the data got there, due to the order of operations.
+    validateContent(outDir, shouldExpectSuccessMarker(),
+        committer.getUUID());
+    expectJobCommitToFail(jContext, committer);
+  }
+*/
+
+  /**
+   * Override point: the failure expected on the attempt to commit a failed
+   * job.
+   * @param jContext job context
+   * @param committer committer
+   * @throws Exception any unexpected failure.
+   */
+  protected void expectJobCommitToFail(JobContext jContext,
+      ManifestCommitter committer) throws Exception {
+    // next attempt will fail as there is no longer a directory to commit
+    expectJobCommitFailure(jContext, committer,
+        FileNotFoundException.class);
+  }
+
+  /**
+   * Expect a job commit operation to fail with a specific exception.
+   * @param jContext job context
+   * @param committer committer
+   * @param clazz class of exception
+   * @return the caught exception
+   * @throws Exception any unexpected failure.
+   */
+  protected static <E extends IOException> E expectJobCommitFailure(
+      JobContext jContext,
+      ManifestCommitter committer,
+      Class<E> clazz)
+      throws Exception {
+
+    return intercept(clazz,
+        () -> {
+          committer.commitJob(jContext);
+          return committer.toString();
+        });
+  }
+
+  protected static void expectFNFEonTaskCommit(
+      ManifestCommitter committer,
+      TaskAttemptContext tContext) throws Exception {
+    intercept(FileNotFoundException.class,
+        () -> {
+          committer.commitTask(tContext);
+          return committer.toString();
+        });
+  }
+
+  /**
+   * Commit a task with no output.
+   * Dest dir should exist.
+   */
+  @Test
+  public void testCommitterWithNoOutputs() throws Exception {
+    describe("Have a task and job with no outputs: expect success");
+    JobData jobData = startJob(localCommitterFactory, false);
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+
+    // do commit
+    committer.commitTask(tContext);
+    Path attemptPath = committer.getTaskAttemptPath(tContext);
+    ContractTestUtils.assertPathExists(
+        attemptPath.getFileSystem(tContext.getConfiguration()),
+        "task attempt dir",
+        attemptPath);
+  }
+
+
+  @Test
+  public void testMapFileOutputCommitter() throws Exception {
+    describe("Test that the committer generates map output into a directory\n" +
+        "starting with the prefix part-");
+    JobData jobData = startJob(false);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+    Configuration conf = jobData.conf;
+
+    // write output
+    writeMapFileOutput(new MapFileOutputFormat()
+            .getRecordWriter(tContext), tContext);
+
+    // do commit
+    commitTaskAndJob(committer, jContext, tContext);
+    FileSystem fs = getFileSystem();
+
+    lsR(fs, outputDir, true);
+    String ls = ls(outputDir);
+    describe("\nvalidating");
+
+    // validate output
+    verifySuccessMarker(outputDir, committer.getJobUniqueId());
+
+    describe("validate output of %s", outputDir);
+    validateMapFileOutputContent(fs, outputDir);
+
+    // Ensure getReaders call works and also ignores
+    // hidden filenames (_ or . prefixes)
+    describe("listing");
+    FileStatus[] filtered = fs.listStatus(outputDir, HIDDEN_FILE_FILTER);
+    Assertions.assertThat(filtered)
+        .describedAs("listed children under %s", ls)
+        .hasSize(1);
+    FileStatus fileStatus = filtered[0];
+    Assertions.assertThat(fileStatus.getPath().getName())
+        .as("Not the part file: " + fileStatus)
+        .startsWith(PART_00000);
+
+    describe("getReaders()");
+    Assertions.assertThat(getReaders(fs, outputDir, conf))
+        .describedAs("getReaders() MapFile.Reader entries with shared FS %s %s", outputDir, ls)
+        .hasSize(1);
+
+    describe("getReaders(new FS)");
+    FileSystem fs2 = FileSystem.get(outputDir.toUri(), conf);
+    Assertions.assertThat(getReaders(fs2, outputDir, conf))
+        .describedAs("getReaders(new FS) %s %s", outputDir, ls)
+        .hasSize(1);
+
+    describe("MapFileOutputFormat.getReaders");
+    Assertions.assertThat(MapFileOutputFormat.getReaders(outputDir, conf))
+        .describedAs("MapFileOutputFormat.getReaders(%s) %s", outputDir, ls)
+        .hasSize(1);
+
+  }
+
+  /** Open the output generated by this format. */
+  @SuppressWarnings("IOResourceOpenedButNotSafelyClosed")
+  private static MapFile.Reader[] getReaders(FileSystem fs,
+      Path dir,
+      Configuration conf) throws IOException {
+    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, HIDDEN_FILE_FILTER));
+
+    // sort names, so that hash partitioning works
+    Arrays.sort(names);
+
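+    // open a reader per part directory; callers are expected to close them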
+    MapFile.Reader[] parts = new MapFile.Reader[names.length];
+    for (int i = 0; i < names.length; i++) {
+      parts[i] = new MapFile.Reader(names[i], conf);
+    }
+    return parts;
+  }
+
+  public static final PathFilter HIDDEN_FILE_FILTER = (path) ->
+      !path.getName().startsWith("_") && !path.getName().startsWith(".");
+
+  /**
+   * A functional interface which an action under test must implement.
+   */
+  @FunctionalInterface
+  public interface ActionToTest {
+
+    void exec(Job job, JobContext jContext, TaskAttemptContext tContext,
+        ManifestCommitter committer) throws Exception;
+  }
+
+  @Test
+  public void testAbortTaskNoWorkDone() throws Exception {
+    executeWork("abort task no work",
+        (job, jContext, tContext, committer) ->
+            committer.abortTask(tContext));
+  }
+
+  @Test
+  public void testAbortJobNoWorkDone() throws Exception {
+    executeWork("abort task no work",
+        (job, jContext, tContext, committer) ->
+            committer.abortJob(jContext, JobStatus.State.RUNNING));
+  }
+
+  @Test
+  public void testCommitJobButNotTask() throws Exception {
+    executeWork("commit a job while a task's work is pending, " +
+            "expect task writes to be cancelled.",
+        (job, jContext, tContext, committer) -> {
+          // step 1: write the text
+          writeTextOutput(tContext);
+          // step 2: commit the job
+          createCommitter(tContext).commitJob(tContext);
+          // verify that no output can be observed
+          assertPart0000DoesNotExist(outputDir);
+        }
+    );
+  }
+
+  @Test
+  public void testAbortTaskThenJob() throws Exception {
+    JobData jobData = startJob(true);
+    ManifestCommitter committer = jobData.committer;
+
+    // do abort
+    committer.abortTask(jobData.tContext);
+
+    intercept(FileNotFoundException.class, "",
+        () -> getPart0000(committer.getWorkPath()));
+
+    committer.abortJob(jobData.jContext, JobStatus.State.FAILED);
+    assertJobAbortCleanedUp(jobData);
+
+  }
+
+  /**
+   * Extension point: assert that the job was all cleaned up after an abort.
+   * Base assertions
+   * <ul>
+   *   <li>Output dir is absent or, if present, empty</li>
+   * </ul>
+   * @param jobData job data
+   * @throws Exception failure
+   */
+  public void assertJobAbortCleanedUp(JobData jobData) throws Exception {
+    FileSystem fs = getFileSystem();
+    try {
+      FileStatus[] children = listChildren(fs, outputDir);
+      if (children.length != 0) {
+        lsR(fs, outputDir, true);
+      }
+      Assertions.assertThat(children)
+          .as("Output directory not empty " + ls(outputDir))
+          .containsExactly(new FileStatus[0]);
+    } catch (FileNotFoundException e) {
+      // this is a valid state; it means the dest dir doesn't exist yet.
+    }
+
+  }
+
+  @Test
+  public void testFailAbort() throws Exception {
+    describe("Abort the task, then job (failed), abort the job again");
+    JobData jobData = startJob(true);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+
+    // do abort
+    committer.abortTask(tContext);
+
+    committer.getJobAttemptPath(jContext);
+    committer.getTaskAttemptPath(tContext);
+    assertPart0000DoesNotExist(outputDir);
+    assertSuccessMarkerDoesNotExist(outputDir);
+    describe("Aborting job into %s", outputDir);
+
+    committer.abortJob(jContext, JobStatus.State.FAILED);
+
+    assertTaskAttemptPathDoesNotExist(committer, tContext);
+    assertJobAttemptPathDoesNotExist(committer, jContext);
+
+    // verify a failure report
+    ManifestSuccessData report = loadReport(jobData.jobId(), false);
+    Map<String, String> diag = report.getDiagnostics();
+    Assertions.assertThat(diag)
+        .describedAs("Stage entry in report")
+        .containsEntry(STAGE, OP_STAGE_JOB_ABORT);
+    IOStatisticsSnapshot reportStats = report.getIOStatistics();
+    verifyStatisticCounterValue(reportStats,
+        OP_STAGE_JOB_ABORT, 1);
+
+    // try again; expect abort to be idempotent.
+    committer.abortJob(jContext, JobStatus.State.FAILED);
+
+  }
+
+  /**
+   * Assert that the given dir does not have the {@code _SUCCESS} marker.
+   * @param dir dir to scan
+   * @throws IOException IO Failure
+   */
+  protected void assertSuccessMarkerDoesNotExist(Path dir) throws IOException {
+    assertPathDoesNotExist("Success marker",
+        new Path(dir, SUCCESS_MARKER));
+  }
+
+  public void assertPart0000DoesNotExist(Path dir) throws Exception {
+    intercept(FileNotFoundException.class,
+        () -> getPart0000(dir));
+    assertPathDoesNotExist("expected output file", new Path(dir, PART_00000));
+  }
+
+  @Test
+  public void testAbortJobNotTask() throws Exception {
+    executeWork("abort task no work",
+        (job, jContext, tContext, committer) -> {
+          // write output
+          writeTextOutput(tContext);
+          committer.abortJob(jContext, JobStatus.State.RUNNING);
+          assertTaskAttemptPathDoesNotExist(
+              committer, tContext);
+          assertJobAttemptPathDoesNotExist(
+              committer, jContext);
+        });
+  }
+
+  /**
+   * This looks at what happens with concurrent commits.
+   * However, the failure condition it looks for (subdir under subdir)
+   * is the kind of failure you see on a rename-based commit.
+   *
+   * What it will not detect is the fact that both tasks will each commit
+   * to the destination directory. That is: whichever commits last wins.
+   *
+   * There's no way to stop this. Instead it is a requirement that the task
+   * commit operation is only executed for those task attempts which are
+   * known to have succeeded; all others must be aborted.
+   * @throws Exception failure
+   */
+  @Test
+  public void testConcurrentCommitTaskWithSubDir() throws Exception {
+    Job job = newJob();
+    FileOutputFormat.setOutputPath(job, outputDir);
+    final Configuration conf = job.getConfiguration();
+
+    final JobContext jContext =
+        new JobContextImpl(conf, taskAttempt0.getJobID());
+    ManifestCommitter amCommitter = createCommitter(
+        new TaskAttemptContextImpl(conf, taskAttempt0));
+    amCommitter.setupJob(jContext);
+
+    final TaskAttemptContext[] taCtx = new TaskAttemptContextImpl[2];
+    taCtx[0] = new TaskAttemptContextImpl(conf, taskAttempt0);
+    taCtx[1] = new TaskAttemptContextImpl(conf, taskAttempt1);
+
+    // IDE/checkstyle complain here about type casting but they
+    // are confused.
+    final TextOutputFormat<Writable, Object>[] tof =
+        new TextOutputForTests[2];
+
+    for (int i = 0; i < tof.length; i++) {
+      tof[i] = new TextOutputForTests<Writable, Object>() {
+        @Override
+        public Path getDefaultWorkFile(
+            TaskAttemptContext context,
+            String extension) throws IOException {
+          final ManifestCommitter foc = (ManifestCommitter)
+              getOutputCommitter(context);
+          return new Path(new Path(foc.getWorkPath(), SUB_DIR),
+              getUniqueFile(context, getOutputName(context), extension));
+        }
+      };
+    }
+
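+    // run both task attempts in parallel, each writing under SUB_DIR
+    // and committing its own task attempt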
+    final ExecutorService executor = HadoopExecutors.newFixedThreadPool(2);
+    try {
+      for (int i = 0; i < taCtx.length; i++) {
+        final int taskIdx = i;
+        executor.submit(() -> {
+          final OutputCommitter outputCommitter =
+              tof[taskIdx].getOutputCommitter(taCtx[taskIdx]);
+          outputCommitter.setupTask(taCtx[taskIdx]);
+          writeOutput(tof[taskIdx].getRecordWriter(taCtx[taskIdx]), taCtx[taskIdx]);
+          describe("Committing Task %d", taskIdx);
+          outputCommitter.commitTask(taCtx[taskIdx]);
+          return null;
+        });
+      }
+    } finally {
+      executor.shutdown();
+      while (!executor.awaitTermination(1, TimeUnit.SECONDS)) {
+        log().info("Awaiting thread termination!");
+      }
+    }
+
+    // if we commit here then all tasks will be committed, so there will
+    // be contention for that final directory: both parts will go in.
+
+    describe("\nCommitting Job");
+    amCommitter.commitJob(jContext);
+    assertPathExists("base output directory", outputDir);
+    assertPart0000DoesNotExist(outputDir);
+    Path outSubDir = new Path(outputDir, SUB_DIR);
+    assertPathDoesNotExist("Must not end up with sub_dir/sub_dir",
+        new Path(outSubDir, SUB_DIR));
+
+    // validate output
+    // There's no success marker in the subdirectory
+    validateContent(outSubDir, false, "");
+  }
+
+  @Test
+  public void testUnsupportedSchema() throws Throwable {
+    intercept(PathIOException.class, () ->
+        new ManifestCommitterFactory()
+            .createOutputCommitter(new Path("s3a://unsupported/"), null));
+  }
+
+  /**
+   * Factory for failing committers.
+   */
+
+
+/*
+  protected ManifestCommitter createFailingCommitter(
+  final TaskAttemptContext tContext)
+      throws IOException {
+    //     TODO
+    return null;
+  }
+
+  public class FailingCommitterFactory implements CommitterFactory {
+
+    @Override
+    public ManifestCommitter createCommitter(TaskAttemptContext context)
+        throws IOException {
+      return createFailingCommitter(context);
+    }
+  }*/
+  @Test
+  public void testOutputFormatIntegration() throws Throwable {
+    Configuration conf = getConfiguration();
+    Job job = newJob();
+    assertCommitterFactoryIsManifestCommitter(job, outputDir);
+    job.setOutputFormatClass(TextOutputForTests.class);
+    conf = job.getConfiguration();
+    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt0);
+    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 1);
+    JobContext jContext = new JobContextImpl(conf, taskAttempt0.getJobID());
+    TaskAttemptContext tContext = new TaskAttemptContextImpl(conf,
+        taskAttempt0);
+    TextOutputForTests<IntWritable, IntWritable> outputFormat =
+        (TextOutputForTests<IntWritable, IntWritable>)
+            ReflectionUtils.newInstance(tContext.getOutputFormatClass(), conf);
+    ManifestCommitter committer = (ManifestCommitter)
+        outputFormat.getOutputCommitter(tContext);
+
+    // setup
+    JobData jobData = new JobData(job, jContext, tContext, committer);
+    setupJob(jobData);
+    abortInTeardown(jobData);
+    TextOutputForTests.LoggingLineRecordWriter<IntWritable, IntWritable> recordWriter
+        = outputFormat.getRecordWriter(tContext);
+    IntWritable iw = new IntWritable(1);
+    recordWriter.write(iw, iw);
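+    // a single record "1\t1\n" is four bytes of output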
+    long expectedLength = 4;
+    Path dest = recordWriter.getDest();
+    validateTaskAttemptPathDuringWrite(dest, expectedLength);
+    recordWriter.close(tContext);
+    // at this point
+    validateTaskAttemptPathAfterWrite(dest, expectedLength);
+    Assertions.assertThat(committer.needsTaskCommit(tContext))
+        .as("Committer does not have data to commit " + committer)
+        .isTrue();
+    commitTask(committer, tContext);
+    // at this point the committer tasks stats should be current.
+    IOStatisticsSnapshot snapshot = new IOStatisticsSnapshot(
+        committer.getIOStatistics());
+    String commitsCompleted = COMMITTER_TASKS_COMPLETED_COUNT;
+    LOG.info("after task commit {}", ioStatisticsToPrettyString(snapshot));
+    verifyStatisticCounterValue(snapshot,
+        commitsCompleted, 1);
+    final TaskManifest manifest = loadManifest(
+        committer.getTaskManifestPath(tContext));
+    LOG.info("Manifest {}", manifest.toJson());
+
+    commitJob(committer, jContext);
+    LOG.info("committer iostatistics {}",
+        ioStatisticsSourceToString(committer));
+
+    // validate output
+    ManifestSuccessData successData = verifySuccessMarker(outputDir,
+        committer.getJobUniqueId());
+
+    // the task commit count should get through the job commit
+    IOStatisticsSnapshot successStats = successData.getIOStatistics();
+    LOG.info("loaded statistics {}", successStats);
+    verifyStatisticCounterValue(successStats,
+        commitsCompleted, 1);
+  }
+
+  /**
+   * Create a committer through reflection then use it to abort
+   * a task. This mimics the action of an AM when a container fails and
+   * the AM wants to abort the task attempt.
+   */
+  @Test
+  public void testAMWorkflow() throws Throwable {
+    describe("Create a committer with a null output path & use as an AM");
+    JobData jobData = startJob(true);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+
+    TaskAttemptContext newAttempt = new TaskAttemptContextImpl(
+        jContext.getConfiguration(),
+        taskAttempt0);
+    Configuration conf = jContext.getConfiguration();
+
+    // bind
+    TextOutputForTests.bind(conf);
+
+    OutputFormat<?, ?> outputFormat
+        = ReflectionUtils.newInstance(newAttempt.getOutputFormatClass(), conf);
+    Path outputPath = FileOutputFormat.getOutputPath(newAttempt);
+    Assertions.assertThat(outputPath)
+        .as("null output path in new task attempt")
+        .isNotNull();
+
+    ManifestCommitter committer2 = (ManifestCommitter)
+        outputFormat.getOutputCommitter(newAttempt);
+    committer2.abortTask(tContext);
+
+  }
+
+  /**
+   * Make sure that two jobs in parallel directory trees coexist.
+   * Note: the two jobs are not trying to write to the same
+   * output directory.
+   * That should be possible, but cleanup must be disabled.
+   */
+  @Test
+  public void testParallelJobsToAdjacentPaths() throws Throwable {
+
+    describe("Run two jobs in parallel, assert they both complete");
+    JobData jobData = startJob(true);
+    Job job1 = jobData.job;
+    ManifestCommitter committer1 = jobData.committer;
+    JobContext jContext1 = jobData.jContext;
+    TaskAttemptContext tContext1 = jobData.tContext;
+
+    // now build up a second job
+    String jobId2 = randomJobId();
+    String attempt20 = "attempt_" + jobId2 + "_m_000000_0";
+    TaskAttemptID taskAttempt20 = TaskAttemptID.forName(attempt20);
+    String attempt21 = "attempt_" + jobId2 + "_m_000001_0";
+    TaskAttemptID taskAttempt21 = TaskAttemptID.forName(attempt21);
+
+    Path job1Dest = outputDir;
+    Path job2Dest = new Path(getOutputDir().getParent(),
+        getMethodName() + "job2Dest");
+    // little safety check
+    Assertions.assertThat(job2Dest)
+        .describedAs("Job destinations")
+        .isNotEqualTo(job1Dest);
+
+    // create the second job
+    Job job2 = newJob(job2Dest,
+        unsetUUIDOptions(new JobConf(getConfiguration())),
+        attempt20);
+    Configuration conf2 = job2.getConfiguration();
+    conf2.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 1);
+    ManifestCommitter committer2 = null;
+    try {
+      JobContext jContext2 = new JobContextImpl(conf2,
+          taskAttempt20.getJobID());
+      TaskAttemptContext tContext2 =
+          new TaskAttemptContextImpl(conf2, taskAttempt20);
+      committer2 = createCommitter(job2Dest, tContext2);
+      JobData jobData2 = new JobData(job2, jContext2, tContext2, committer2);
+      setupJob(jobData2);
+      abortInTeardown(jobData2);
+      // make sure the directories are different
+      Assertions.assertThat(committer1.getOutputPath())
+          .describedAs("Committer output path of %s and %s", committer1, committer2)
+          .isNotEqualTo(committer2.getOutputPath());
+      // and job IDs
+      Assertions.assertThat(committer1.getJobUniqueId())
+          .describedAs("JobUnique IDs of %s and %s", committer1, committer2)
+          .isNotEqualTo(committer2.getJobUniqueId());
+
+      // job2 setup, write some data there
+      writeTextOutput(tContext2);
+
+      // at this point, job1 and job2 both have uncommitted tasks
+
+      // commit tasks in order task 2, task 1.
+      commitTask(committer2, tContext2);
+      commitTask(committer1, tContext1);
+
+      // commit jobs in order job 1, job 2
+      commitJob(committer1, jContext1);
+
+      getPart0000(job1Dest);
+
+      commitJob(committer2, jContext2);
+      getPart0000(job2Dest);
+
+    } finally {
+      // clean things up in test failures.
+      FileSystem fs = getFileSystem();
+      if (committer1 != null) {
+        fs.delete(committer1.getOutputPath(), true);
+      }
+      if (committer2 != null) {
+        fs.delete(committer2.getOutputPath(), true);
+      }
+    }
+
+  }
+
+  /**
+   * Strip staging/spark UUID options.
+   * @param conf config
+   * @return the patched config
+   */
+  protected Configuration unsetUUIDOptions(final Configuration conf) {
+    conf.unset(SPARK_WRITE_UUID);
+    return conf;
+  }
+
+  /**
+   * Assert that a committer's job attempt path exists.
+   * For the staging committers, this is in the cluster FS.
+   * @param committer committer
+   * @param jobContext job context
+   * @throws IOException failure
+   */
+  protected void assertJobAttemptPathExists(
+      final ManifestCommitter committer,
+      final JobContext jobContext) throws IOException {
+    Path attemptPath = committer.getJobAttemptPath(jobContext);
+    ContractTestUtils.assertIsDirectory(
+        attemptPath.getFileSystem(committer.getConf()),
+        attemptPath);
+  }
+
+  /**
+   * Validate the path of a file being written to during the write
+   * itself.
+   * @param p path
+   * @param expectedLength expected length of the data written
+   * @throws IOException IO failure
+   */
+  protected void validateTaskAttemptPathDuringWrite(Path p,
+      final long expectedLength) throws IOException {
+
+  }
+
+  /**
+   * Validate the path of a file being written to after the write
+   * operation has completed.
+   * @param p path
+   * @param expectedLength expected length of the data written
+   * @throws IOException IO failure
+   */
+  protected void validateTaskAttemptPathAfterWrite(Path p,
+      final long expectedLength) throws IOException {
+
+  }
+
+  /**
+   * Perform any actions needed to validate the working directory of
+   * a committer.
+   * For example: filesystem, path attributes
+   * @param committer committer instance
+   * @param context task attempt context
+   * @throws IOException IO failure
+   */
+  protected void validateTaskAttemptWorkingDirectory(
+      ManifestCommitter committer,
+      TaskAttemptContext context) throws IOException {
+  }
+
+  /**
+   * Commit a task then validate the state of the committer afterwards.
+   * @param committer committer
+   * @param tContext task context
+   * @throws IOException IO failure
+   */
+  protected void commitTask(final ManifestCommitter committer,
+      final TaskAttemptContext tContext) throws IOException {
+    committer.commitTask(tContext);
+  }
+
+  /**
+   * Commit a job then validate the state of the committer afterwards.
+   * @param committer committer
+   * @param jContext job context
+   * @throws IOException IO failure
+   */
+  protected void commitJob(final ManifestCommitter committer,
+      final JobContext jContext) throws IOException {
+    committer.commitJob(jContext);
+
+  }
+
+}

+ 379 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestRenameStageFailure.java

@@ -0,0 +1,379 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Assume;
+import org.junit.Test;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.fs.CommonPathCapabilities;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.contract.ContractTestUtils;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.UnreliableManifestStoreOperations;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.RenameFilesStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.fs.contract.ContractTestUtils.verifyFileContents;
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticCounter;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_COMMIT_FILE_RENAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.getEtag;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.UnreliableManifestStoreOperations.SIMULATED_FAILURE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.AbstractJobOrTaskStage.FAILED_TO_RENAME_PREFIX;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * Test renaming files with fault injection.
+ * This explores etag support and overwrite-on-rename semantics
+ * of the target FS, so some of the tests behave differently
+ * on different stores.
+ */
+public class TestRenameStageFailure extends AbstractManifestCommitterTest {
+
+  /**
+   * Statistic to look for.
+   */
+  public static final String RENAME_FAILURES = OP_COMMIT_FILE_RENAME + ".failures";
+  private static final int FAILING_FILE_INDEX = 5;
+
+  /**
+   * Fault Injection.
+   */
+  private UnreliableManifestStoreOperations failures;
+
+  /** etags returned in listing/file status operations? */
+  private boolean etagsSupported;
+
+  /** etags preserved through rename? */
+  private boolean etagsPreserved;
+
+  /** resilient commit expected? */
+  private boolean resilientCommit;
+
+  protected boolean isResilientCommit() {
+    return resilientCommit;
+  }
+
+  protected boolean isEtagsPreserved() {
+    return etagsPreserved;
+  }
+
+  protected boolean isEtagsSupported() {
+    return etagsSupported;
+  }
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    final FileSystem fs = getFileSystem();
+    final Path methodPath = methodPath();
+    etagsSupported = fs.hasPathCapability(methodPath,
+        CommonPathCapabilities.ETAGS_AVAILABLE);
+    etagsPreserved = fs.hasPathCapability(methodPath,
+        CommonPathCapabilities.ETAGS_PRESERVED_IN_RENAME);
+
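+    // wrap the store operations with fault injection; resilient-commit
+    // support is probed on the underlying (real) store operations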
+    final ManifestStoreOperations wrappedOperations = getStoreOperations();
+    failures
+        = new UnreliableManifestStoreOperations(wrappedOperations);
+    setStoreOperations(failures);
+    resilientCommit = wrappedOperations.storeSupportsResilientCommit();
+  }
+
+  /**
+   * Does this test suite require rename resilience in the store/FS?
+   * @return true if the store operations are resilient.
+   */
+  protected boolean requireRenameResilience() {
+    return false;
+  }
+
+  @Test
+  public void testResilienceAsExpected() throws Throwable {
+    Assertions.assertThat(isResilientCommit())
+        .describedAs("resilient commit support")
+        .isEqualTo(requireRenameResilience());
+  }
+
+  @Test
+  public void testRenameSourceException() throws Throwable {
+    describe("rename fails raising an IOE -expect stage to fail" +
+        " and exception message preserved");
+
+    // destination directory.
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir);
+    Path jobAttemptTaskSubDir = stageConfig.getJobAttemptTaskSubDir();
+
+    // create a manifest with a lot of files, one of whose
+    // renames will fail
+    TaskManifest manifest = new TaskManifest();
+    createFileset(destDir, jobAttemptTaskSubDir, manifest, filesToCreate());
+    final List<FileEntry> filesToCommit = manifest.getFilesToCommit();
+    final FileEntry entry = filesToCommit.get(FAILING_FILE_INDEX);
+    failures.addRenameSourceFilesToFail(entry.getSourcePath());
+
+    // rename MUST fail
+    expectRenameFailure(
+        new RenameFilesStage(stageConfig),
+        manifest,
+        filesToCommit.size(),
+        SIMULATED_FAILURE,
+        PathIOException.class);
+  }
+
+  /**
+   * Number of files to create; must be more than
+   * {@link #FAILING_FILE_INDEX}.
+   */
+  protected int filesToCreate() {
+    return 100;
+  }
+
+  @Test
+  public void testCommitMissingFile() throws Throwable {
+    describe("commit a file which doesn't exist. Expect FNFE always");
+    // destination directory.
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir);
+    Path jobAttemptTaskSubDir = stageConfig.getJobAttemptTaskSubDir();
+    TaskManifest manifest = new TaskManifest();
+    final List<FileEntry> filesToCommit = manifest.getFilesToCommit();
+
+    Path source = new Path(jobAttemptTaskSubDir, "source.parquet");
+    Path dest = new Path(destDir, "destdir.parquet");
+    filesToCommit.add(new FileEntry(source, dest, 0, null));
+    final FileNotFoundException ex = expectRenameFailure(
+        new RenameFilesStage(stageConfig),
+        manifest,
+        0,
+        "",
+        FileNotFoundException.class);
+    LOG.info("Exception raised: {}", ex.toString());
+  }
+
+  /**
+   * Verify that when a job is configured to delete target paths,
+   * renaming will overwrite them.
+   * This test has to use FileSystem contract settings to determine
+   * whether or not the FS will actually permit file-over-file rename.
+   * The local filesystem follows POSIX semantics, so the test will not
+   * fail there when the destination exists.
+   * ABFS and GCS reject file-over-file rename, so they are required to
+   * fail the first rename sequence, but to succeed once delete.target.paths
+   * is true.
+   */
+  @Test
+  public void testDeleteTargetPaths() throws Throwable {
+    describe("Verify that target path deletion works");
+    // destination directory.
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir)
+        .withDeleteTargetPaths(true);
+    Path jobAttemptTaskSubDir = stageConfig.getJobAttemptTaskSubDir();
+    final Path source = new Path(jobAttemptTaskSubDir, "source.txt");
+    final Path dest = new Path(destDir, "source.txt");
+    final byte[] sourceData = "data".getBytes(StandardCharsets.UTF_8);
+    final FileSystem fs = getFileSystem();
+    ContractTestUtils.createFile(fs, source, false, sourceData);
+    touch(fs, dest);
+    TaskManifest manifest = new TaskManifest();
+    final FileEntry entry = createEntryWithEtag(source, dest);
+    manifest.addFileToCommit(entry);
+
+    List<TaskManifest> manifests = new ArrayList<>();
+    manifests.add(manifest);
+
+    // local POSIX filesystems allow rename of file onto file, so
+    // don't fail on the rename.
+    boolean renameOverwritesDest = isSupported(RENAME_OVERWRITES_DEST);
+
+    if (!renameOverwritesDest) {
+      // HDFS, ABFS and GCS do all reject rename of file onto file.
+      // ABFS will use its rename operation so will even raise a
+      // meaningful exception here.
+      final IOException ex = expectRenameFailure(
+          new RenameFilesStage(stageConfig.withDeleteTargetPaths(false)),
+          manifest,
+          0,
+          "",
+          IOException.class);
+      LOG.info("Exception raised: {}", ex.toString());
+    }
+
+    // delete target paths and it works
+    new RenameFilesStage(stageConfig.withDeleteTargetPaths(true))
+        .apply(Pair.of(manifests, Collections.emptySet()));
+
+    // and the new data made it over
+    verifyFileContents(fs, dest, sourceData);
+
+    // let's check the etag too, for completeness
+    if (isEtagsPreserved()) {
+      Assertions.assertThat(getEtag(fs.getFileStatus(dest)))
+          .describedAs("Etag of destination file %s", dest)
+          .isEqualTo(entry.getEtag());
+    }
+
+  }
+
+  @Test
+  public void testRenameReturnsFalse() throws Throwable {
+    describe("commit where rename() returns false for one file." +
+        " Expect failure to be escalated to an IOE");
+
+    Assume.assumeTrue("not used when resilient commits are available",
+        !resilientCommit);
+    // destination directory.
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir);
+    Path jobAttemptTaskSubDir = stageConfig.getJobAttemptTaskSubDir();
+
+    // create a manifest with a lot of files, one of whose
+    // renames will fail
+    TaskManifest manifest = new TaskManifest();
+    createFileset(destDir, jobAttemptTaskSubDir, manifest, filesToCreate());
+
+    final List<FileEntry> filesToCommit = manifest.getFilesToCommit();
+    final FileEntry entry = filesToCommit.get(FAILING_FILE_INDEX);
+    failures.addRenameSourceFilesToFail(entry.getSourcePath());
+
+    // switch to rename returning false; again, this must
+    // be escalated to a failure.
+    failures.setRenameToFailWithException(false);
+    expectRenameFailure(
+        new RenameFilesStage(stageConfig),
+        manifest,
+        filesToCommit.size(),
+        FAILED_TO_RENAME_PREFIX,
+        PathIOException.class);
+  }
+
+  /**
+   * Create the source files for a task.
+   * @param destDir destination directory
+   * @param taskAttemptDir directory of the task attempt
+   * @param manifest manifest to update.
+   * @param fileCount how many files.
+   */
+  private void createFileset(
+      final Path destDir,
+      final Path taskAttemptDir,
+      final TaskManifest manifest,
+      final int fileCount) throws IOException {
+    final FileSystem fs = getFileSystem();
+    for (int i = 0; i < fileCount; i++) {
+      String name = String.format("file%04d", i);
+      Path src = new Path(taskAttemptDir, name);
+      Path dest = new Path(destDir, name);
+      touch(fs, src);
+
+      final FileEntry entry = createEntryWithEtag(src, dest);
+      manifest.addFileToCommit(entry);
+    }
+  }
+
+  /**
+   * Create a manifest entry, including size.
+   * If the FS supports etags, one is retrieved.
+   * @param source source
+   * @param dest dest
+   * @return entry
+   * @throws IOException if getFileStatus failed.
+   */
+  private FileEntry createEntryWithEtag(final Path source,
+      final Path dest)
+      throws IOException {
+    final FileStatus st = getFileSystem().getFileStatus(source);
+    final String etag = isEtagsSupported()
+        ? getEtag(st)
+        : null;
+
+    return new FileEntry(source, dest, st.getLen(), etag);
+  }
+
+  /**
+   * Execute rename, expecting a failure.
+   * The number of files renamed MUST be less than the value of {@code files}.
+   * @param stage stage
+   * @param manifest task manifests
+   * @param files number of files being renamed.
+   * @param errorText text which must be in the exception string
+   * @param exceptionClass class of the exception
+   * @return the caught exception
+   * @throws Exception if anything else went wrong, or no exception was raised.
+   */
+  private <E extends Throwable> E expectRenameFailure(
+      RenameFilesStage stage,
+      TaskManifest manifest,
+      int files,
+      String errorText,
+      Class<E> exceptionClass) throws Exception {
+
+    List<TaskManifest> manifests = new ArrayList<>();
+    manifests.add(manifest);
+    ProgressCounter progressCounter = getProgressCounter();
+    progressCounter.reset();
+    IOStatisticsStore iostatistics = stage.getIOStatistics();
+    long failures0 = iostatistics.counters().get(RENAME_FAILURES);
+
+    // rename MUST raise an exception.
+    E ex = intercept(exceptionClass, errorText, () ->
+        stage.apply(Pair.of(manifests, Collections.emptySet())));
+
+    LOG.info("Statistics {}", ioStatisticsToPrettyString(iostatistics));
+    // the IOStatistics record the rename as a failure.
+    assertThatStatisticCounter(iostatistics, RENAME_FAILURES)
+        .isEqualTo(failures0 + 1);
+
+    // count of files committed MUST be less than expected.
+    if (files > 0) {
+
+      Assertions.assertThat(stage.getFilesCommitted())
+          .describedAs("Files Committed by stage")
+          .isNotEmpty()
+          .hasSizeLessThan(files);
+
+    }
+
+    // the progress counter will show that the rename stage invoked it;
+    // there's no assertion on the actual value as it depends on
+    // the execution timing of the threads.
+
+    Assertions.assertThat(progressCounter.value())
+        .describedAs("Progress counter %s", progressCounter)
+        .isGreaterThan(0);
+    return ex;
+  }
+}
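The expectRenameFailure() helper above is built on LambdaTestUtils.intercept(). A minimal, hedged sketch of that pattern follows; "someStage" and "manifests" are placeholders for the fixtures constructed earlier in the test, not names from this patch.

    // intercept() runs the closure, fails the test if nothing is thrown, and
    // returns the exception once its class and message text match, so further
    // assertions can inspect it.
    PathIOException ex = intercept(PathIOException.class, FAILED_TO_RENAME_PREFIX,
        () -> someStage.apply(Pair.of(manifests, Collections.emptySet())));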

+ 185 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestTaskManifestFileIO.java

@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.JsonSerialization;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.assertDirEntryMatch;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.assertFileEntryMatch;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.marshallPath;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.unmarshallPath;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * Test reading/writing of manifest files.
+ */
+public class TestTaskManifestFileIO extends AbstractManifestCommitterTest {
+
+  private TaskManifest source;
+
+  private ManifestCommitterTestSupport.JobAndTaskIDsForTests taskIDs;
+
+  private String taskAttempt00;
+
+  private Path testPath;
+
+  private Path taPath;
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    taskIDs = new ManifestCommitterTestSupport.JobAndTaskIDsForTests(2, 2);
+    source = new TaskManifest();
+    taskAttempt00 = taskIDs.getTaskAttempt(0, 0);
+    source.setTaskAttemptID(taskAttempt00);
+    testPath = methodPath();
+    taPath = new Path(testPath, "  " + taskAttempt00);
+    source.setTaskAttemptDir(marshallPath(taPath));
+  }
+
+  /**
+   * Test marshalling, paying attention to paths with spaces in them
+   * as they've been a source of trouble in the S3A committers.
+   */
+  @Test
+  public void testJsonRoundTrip() throws Throwable {
+    describe("Save manifest file to string and back");
+    Path subdirS = new Path(taPath, "subdir");
+    Path subdirD = new Path(testPath, "subdir");
+    source.addDirectory(DirEntry.dirEntry(subdirD, 0, 0));
+
+    // a file
+    Path subfileS = new Path(subdirS, "file");
+    Path subfileD = new Path(subdirD, "file");
+    long len = 256L;
+    FileEntry subFileEntry = new FileEntry(subfileS,
+        subfileD, len, "etag");
+    source.addFileToCommit(subFileEntry);
+
+
+    JsonSerialization<TaskManifest> serializer
+        = TaskManifest.serializer();
+
+    String json = serializer.toJson(source);
+    LOG.info("serialized form\n{}", json);
+    TaskManifest deser = serializer.fromJson(json);
+    deser.validate();
+
+    Assertions.assertThat(deser.getTaskAttemptID())
+        .describedAs("Task attempt ID")
+        .isEqualTo(taskAttempt00);
+
+    Assertions.assertThat(unmarshallPath(deser.getTaskAttemptDir()))
+        .describedAs("Task attempt Dir %s",
+            deser.getTaskAttemptDir())
+        .isEqualTo(taPath);
+
+    Assertions.assertThat(deser.getDestDirectories())
+        .hasSize(1)
+        .allSatisfy(d -> assertDirEntryMatch(d, subdirD, 0));
+    Assertions.assertThat(deser.getFilesToCommit())
+        .hasSize(1)
+        .allSatisfy(d -> assertFileEntryMatch(d, subfileS, subfileD, len));
+    final FileEntry entry = deser.getFilesToCommit().get(0);
+    assertFileEntryMatch(entry, subfileS, subfileD, len);
+    Assertions.assertThat(entry.getEtag())
+        .describedAs("etag of %s", entry)
+        .isEqualTo("etag");
+
+  }
+
+  /**
+   * The manifest validation logic has a safety check that only one
+   * file can rename to the same destination, and that the entries
+   * are valid.
+   */
+  @Test
+  public void testValidateRejectsTwoCommitsToSameDest() throws Throwable {
+
+    Path subdirS = new Path(taPath, "subdir");
+    Path subdirD = new Path(testPath, "subdir");
+    source.addDirectory(DirEntry.dirEntry(subdirD, 0, 0));
+
+    // a file
+    Path subfileS = new Path(subdirS, "file");
+    Path subfileS2 = new Path(subdirS, "file2");
+    Path subfileD = new Path(subdirD, "file");
+    long len = 256L;
+    source.addFileToCommit(
+        new FileEntry(subfileS, subfileD, len, "tag1"));
+    source.addFileToCommit(
+        new FileEntry(subfileS2, subfileD, len, "tag2"));
+    assertValidationFailureOnRoundTrip(source);
+  }
+
+  /**
+   * The manifest validation logic has a safety check that only one
+   * file can rename to the same destination, and that the entries
+   * are valid.
+   */
+  @Test
+  public void testValidateRejectsIncompleteFileEntry() throws Throwable {
+    source.addFileToCommit(
+        new FileEntry(taPath, null, 0, null));
+    assertValidationFailureOnRoundTrip(source);
+  }
+
+  /**
+   * negative lengths are not allowed.
+   */
+  @Test
+  public void testValidateRejectsInvalidFileLength() throws Throwable {
+    source.addFileToCommit(
+        new FileEntry(taPath, testPath, -1, null));
+    assertValidationFailureOnRoundTrip(source);
+  }
+
+  @Test
+  public void testRejectIncompatibleVersion() throws Throwable {
+    source.setVersion(5);
+    assertValidationFailureOnRoundTrip(source);
+  }
+
+  @Test
+  public void testRejectIncompatibleType() throws Throwable {
+    source.setType("Incompatible type");
+    assertValidationFailureOnRoundTrip(source);
+  }
+
+  private void assertValidationFailureOnRoundTrip(
+      final TaskManifest manifest) throws Exception {
+    JsonSerialization<TaskManifest> serializer
+        = TaskManifest.serializer();
+    String json = serializer.toJson(manifest);
+    LOG.info("serialized form\n{}", json);
+    TaskManifest deser = serializer.fromJson(json);
+    intercept(IOException.class, deser::validate);
+  }
+
+
+}
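A hedged sketch of the path round-trip these tests exercise: marshallPath()/unmarshallPath(), imported at the top of the suite, convert a Path to and from its serialized string form, and a name with leading spaces (as set up in setup()) must survive the trip. The literal paths below are illustrative only.

    Path parent = new Path("file:/tmp/job_0001");
    Path withSpaces = new Path(parent, "  attempt_00_00");   // leading spaces, as in setup()
    String wire = marshallPath(withSpaces);     // string form stored in the manifest
    Path restored = unmarshallPath(wire);       // parsed back when the manifest is loaded
    Assertions.assertThat(restored).isEqualTo(withSpaces);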

+ 136 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TextOutputForTests.java

@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ReflectionUtils;
+
+/**
+ * A subclass of {@link TextOutputFormat} which logs what is happening, and
+ * returns a {@link LoggingLineRecordWriter} which allows the caller
+ * to get the destination path.
+ * @param <K> key
+ * @param <V> value
+ */
+public class TextOutputForTests<K, V> extends TextOutputFormat<K, V> {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(TextOutputForTests.class);
+
+  public static final String NAME
+      = "org.apache.hadoop.fs.s3a.commit.LoggingTextOutputFormat";
+
+  @Override
+  public LoggingLineRecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
+      throws IOException, InterruptedException {
+    Configuration conf = job.getConfiguration();
+    boolean isCompressed = getCompressOutput(job);
+    String keyValueSeparator = conf.get(SEPARATOR, "\t");
+    CompressionCodec codec = null;
+    String extension = "";
+    if (isCompressed) {
+      Class<? extends CompressionCodec> codecClass =
+          getOutputCompressorClass(job, GzipCodec.class);
+      codec = ReflectionUtils.newInstance(codecClass, conf);
+      extension = codec.getDefaultExtension();
+    }
+    Path file = getDefaultWorkFile(job, extension);
+    FileSystem fs = file.getFileSystem(conf);
+    FSDataOutputStream fileOut = fs.create(file, true);
+    LOG.debug("Creating LineRecordWriter with destination {}", file);
+    if (isCompressed) {
+      return new LoggingLineRecordWriter<>(
+          file, new DataOutputStream(codec.createOutputStream(fileOut)),
+          keyValueSeparator);
+    } else {
+      return new LoggingLineRecordWriter<>(file, fileOut, keyValueSeparator);
+    }
+  }
+
+  /**
+   * Writes a line; counts the number of lines written and logs the total
+   * at debug in the {@code close()} call.
+   * @param <K> key
+   * @param <V> value
+   */
+  public static class LoggingLineRecordWriter<K, V>
+      extends LineRecordWriter<K, V> {
+
+    private final Path dest;
+
+    private long lines;
+
+    public LoggingLineRecordWriter(Path dest, DataOutputStream out,
+        String keyValueSeparator) {
+      super(out, keyValueSeparator);
+      this.dest = dest;
+    }
+
+    public LoggingLineRecordWriter(DataOutputStream out, Path dest) {
+      super(out);
+      this.dest = dest;
+    }
+
+    @Override
+    public synchronized void write(K key, V value) throws IOException {
+      super.write(key, value);
+      lines++;
+    }
+
+    public synchronized void close(TaskAttemptContext context)
+        throws IOException {
+      LOG.debug("Closing output file {} with {} lines :{}",
+          dest, lines, out);
+      out.close();
+    }
+
+    public Path getDest() {
+      return dest;
+    }
+
+    public long getLines() {
+      return lines;
+    }
+  }
+
+  /**
+   * Bind to a configuration for job submission.
+   * @param conf configuration
+   */
+  public static void bind(Configuration conf) {
+    conf.setClass(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR,
+        TextOutputForTests.class,
+        OutputFormat.class);
+  }
+}
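A hedged usage sketch for the class above: a test job switches its output format over with bind(), and task-side code can then recover the destination path from the writer. The taskAttemptContext, key and value variables are placeholders, not names from this patch.

    Configuration conf = new Configuration();
    TextOutputForTests.bind(conf);   // the job now uses TextOutputForTests
    // ... submit the job; inside a task attempt:
    TextOutputForTests<LongWritable, Text> format = new TextOutputForTests<>();
    TextOutputForTests.LoggingLineRecordWriter<LongWritable, Text> writer =
        format.getRecordWriter(taskAttemptContext);
    writer.write(key, value);
    Path whereItWent = writer.getDest();   // destination file for later assertions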

+ 83 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ThreadLeakTracker.java

@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+
+import org.assertj.core.api.Assertions;
+
+/**
+ * Thread Leak tracker.
+ * Initialized with the current set of threads and some known-long-life
+ * threads whose presence must not be considered a failure.
+ */
+public class ThreadLeakTracker {
+
+  private final Set<String> trackedThreads =
+      listInitialThreadsForLifecycleChecks();
+
+  public void assertNoThreadLeakage() {
+    Assertions.assertThat(getCurrentThreadNames())
+        .describedAs("The threads at the end of the test run")
+        .isSubsetOf(trackedThreads);
+  }
+
+  public Set<String> getTrackedThreads() {
+    return trackedThreads;
+  }
+
+  /**
+   * Get a set containing the names of all active threads,
+   * stripping out all test runner threads.
+   * @return the current set of threads.
+   */
+  public static Set<String> getCurrentThreadNames() {
+    TreeSet<String> threads = Thread.getAllStackTraces().keySet()
+        .stream()
+        .map(Thread::getName)
+        .filter(n -> n.startsWith("JUnit"))
+        .filter(n -> n.startsWith("surefire"))
+        .collect(Collectors.toCollection(TreeSet::new));
+    return threads;
+  }
+
+  /**
+   * This creates a set containing all current threads and some well-known
+   * thread names whose existence should not fail test runs.
+   * They are generally static cleaner threads created by various classes
+   * on instantiation.
+   * @return a set of threads to use in later assertions.
+   */
+  public static Set<String> listInitialThreadsForLifecycleChecks() {
+    Set<String> threadSet = getCurrentThreadNames();
+    // static filesystem statistics cleaner
+    threadSet.add("org.apache.hadoop.fs.FileSystem$Statistics$StatisticsDataReferenceCleaner");
+
+    // java.lang.UNIXProcess. maybe if chmod is called?
+    threadSet.add("process reaper");
+    // once a quantile has been scheduled, the mutable quantile thread pool
+    // is initialized; it has a minimum thread size of 1.
+    threadSet.add("MutableQuantiles-0");
+    // IDE?
+    threadSet.add("Attach Listener");
+    return threadSet;
+  }
+}
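A hedged sketch of how a suite would typically hook the tracker in, using the JUnit 4 lifecycle annotations already used by these tests; the field and method names are illustrative.

    // created once, capturing the threads present before any test runs
    private static final ThreadLeakTracker THREAD_LEAK_TRACKER = new ThreadLeakTracker();

    @AfterClass
    public static void checkForThreadLeakage() {
      // fails if any thread not present (or allow-listed) at startup survives
      THREAD_LEAK_TRACKER.assertNoThreadLeakage();
    }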

+ 380 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/UnreliableManifestStoreOperations.java

@@ -0,0 +1,380 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InterruptedIOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.JsonSerialization;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.OPERATION_TIMED_OUT;
+
+/**
+ * Wrap an existing {@link ManifestStoreOperations} implementation and fail on
+ * specific paths.
+ * This is for testing. It could be implemented via
+ * Mockito 2 spy code, but is not, so that:
+ * 1. It can be backported to Hadoop versions using Mockito 1.x.
+ * 2. It can be extended for use in production. This is why it is in
+ * the production module, to allow downstream tests to adopt it.
+ * 3. You can actually debug what's going on.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public class UnreliableManifestStoreOperations extends ManifestStoreOperations {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      UnreliableManifestStoreOperations.class);
+
+  /**
+   * The timeout message ABFS raises.
+   */
+  public static final String E_TIMEOUT
+      = "Operation could not be completed within the specified time";
+
+  /**
+   * Text to use in simulated failure exceptions.
+   */
+  public static final String SIMULATED_FAILURE = "Simulated failure";
+
+  /**
+   * Underlying store operations to wrap.
+   */
+  private final ManifestStoreOperations wrappedOperations;
+
+  /**
+   * Paths of delete operations to fail.
+   */
+  private final Set<Path> deletePathsToFail = new HashSet<>();
+
+  /**
+   * Paths of delete operations to time out, as ABFS may.
+   */
+  private final Set<Path> deletePathsToTimeOut = new HashSet<>();
+
+  /**
+   * Paths of List operations to fail.
+   */
+  private final Set<Path> listToFail = new HashSet<>();
+
+  /**
+   * Paths of mkdirs operations to fail.
+   */
+  private final Set<Path> mkdirsToFail = new HashSet<>();
+
+  /**
+   * Paths which don't exist.
+   */
+  private final Set<Path> pathNotFound = new HashSet<>();
+
+  /**
+   * Source file whose rename/commit will fail.
+   */
+  private final Set<Path> renameSourceFilesToFail = new HashSet<>();
+
+  /**
+   * Dest dir into which all renames/commits will fail.
+   * Subdirectories under this are not checked.
+   */
+  private final Set<Path> renameDestDirsToFail = new HashSet<>();
+
+  /**
+   * Path of save() to fail.
+   */
+  private final Set<Path> saveToFail = new HashSet<>();
+
+  /**
+   * Sleep time in milliseconds used when simulating a timeout.
+   */
+  private int timeoutSleepTimeMillis;
+
+  /**
+   * Should rename throw an exception or just return false?
+   */
+  private boolean renameToFailWithException = true;
+
+  /**
+   * Constructor.
+   * @param wrappedOperations operations to wrap.
+   */
+  public UnreliableManifestStoreOperations(final ManifestStoreOperations wrappedOperations) {
+    this.wrappedOperations = wrappedOperations;
+  }
+
+
+  /**
+   * Reset everything.
+   */
+  public void reset() {
+    deletePathsToFail.clear();
+    deletePathsToTimeOut.clear();
+    listToFail.clear();
+    mkdirsToFail.clear();
+    pathNotFound.clear();
+    renameSourceFilesToFail.clear();
+    renameDestDirsToFail.clear();
+    saveToFail.clear();
+    timeoutSleepTimeMillis = 0;
+  }
+
+  public int getTimeoutSleepTimeMillis() {
+    return timeoutSleepTimeMillis;
+  }
+
+  public void setTimeoutSleepTimeMillis(final int timeoutSleepTimeMillis) {
+    this.timeoutSleepTimeMillis = timeoutSleepTimeMillis;
+  }
+
+  public boolean getRenameToFailWithException() {
+    return renameToFailWithException;
+  }
+
+  public void setRenameToFailWithException(
+      final boolean renameToFailWithException) {
+    this.renameToFailWithException = renameToFailWithException;
+  }
+
+  /**
+   * Add a path to the list of delete paths to fail.
+   * @param path path to add.
+   */
+  public void addDeletePathToFail(Path path) {
+    deletePathsToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of delete paths to time out.
+   * @param path path to add.
+   */
+  public void addDeletePathToTimeOut(Path path) {
+    deletePathsToTimeOut.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of paths where list will fail.
+   * @param path path to add.
+   */
+  public void addListToFail(Path path) {
+    listToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of mkdir calls to fail.
+   * @param path path to add.
+   */
+  public void addMkdirsToFail(Path path) {
+    mkdirsToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path not found.
+   * @param path path
+   */
+  public void addPathNotFound(Path path) {
+    pathNotFound.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of rename source paths to fail.
+   * @param path path to add.
+   */
+  public void addRenameSourceFilesToFail(Path path) {
+    renameSourceFilesToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of dest dirs to fail.
+   * @param path path to add.
+   */
+  public void addRenameDestDirsFail(Path path) {
+    renameDestDirsToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of paths where save will fail.
+   * @param path path to add.
+   */
+  public void addSaveToFail(Path path) {
+    saveToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Raise an exception if the path is in the set of target paths.
+   * @param operation operation which failed.
+   * @param path path to check
+   * @param paths paths to probe for {@code path} being in.
+   * @throws IOException simulated failure
+   */
+  private void maybeRaiseIOE(String operation, Path path, Set<Path> paths)
+      throws IOException {
+    if (paths.contains(path)) {
+      LOG.info("Simulating failure of {} with {}", operation, path);
+      throw new PathIOException(path.toString(),
+          SIMULATED_FAILURE + " of " + operation);
+    }
+  }
+
+  /**
+   * Verify that a path is not on the file not found list.
+   * @param path path
+   * @throws FileNotFoundException if configured to fail.
+   */
+  private void verifyExists(Path path) throws FileNotFoundException {
+    if (pathNotFound.contains(path)) {
+      throw new FileNotFoundException(path.toString());
+    }
+  }
+
+  /**
+   * Time out if the path is in the list of timeout paths.
+   * Will sleep first, to help simulate delays.
+   * @param operation operation which failed.
+   * @param path path to check
+   * @param paths paths to probe for {@code path} being in.
+   * @throws IOException simulated timeout
+   */
+  private void maybeTimeout(String operation, Path path, Set<Path> paths)
+      throws IOException {
+    if (paths.contains(path)) {
+      LOG.info("Simulating timeout of {} with {}", operation, path);
+      try {
+        if (timeoutSleepTimeMillis > 0) {
+          Thread.sleep(timeoutSleepTimeMillis);
+        }
+      } catch (InterruptedException e) {
+        throw new InterruptedIOException(e.toString());
+      }
+      throw new PathIOException(path.toString(),
+          "ErrorCode=" + OPERATION_TIMED_OUT
+              + " ErrorMessage=" + E_TIMEOUT);
+    }
+  }
+
+  @Override
+  public FileStatus getFileStatus(final Path path) throws IOException {
+    verifyExists(path);
+    return wrappedOperations.getFileStatus(path);
+  }
+
+  @Override
+  public boolean delete(final Path path, final boolean recursive)
+      throws IOException {
+    String op = "delete";
+    maybeTimeout(op, path, deletePathsToTimeOut);
+    maybeRaiseIOE(op, path, deletePathsToFail);
+    return wrappedOperations.delete(path, recursive);
+  }
+
+  @Override
+  public boolean mkdirs(final Path path) throws IOException {
+    maybeRaiseIOE("mkdirs", path, mkdirsToFail);
+    return wrappedOperations.mkdirs(path);
+  }
+
+  @Override
+  public boolean renameFile(final Path source, final Path dest)
+      throws IOException {
+    String op = "rename";
+    if (renameToFailWithException) {
+      maybeRaiseIOE(op, source, renameSourceFilesToFail);
+      maybeRaiseIOE(op, dest.getParent(), renameDestDirsToFail);
+    } else {
+      if (renameSourceFilesToFail.contains(source)
+          || renameDestDirsToFail.contains(dest.getParent())) {
+        LOG.info("Failing rename({}, {})", source, dest);
+        return false;
+      }
+    }
+    return wrappedOperations.renameFile(source, dest);
+  }
+
+  @Override
+  public RemoteIterator<FileStatus> listStatusIterator(final Path path)
+      throws IOException {
+    verifyExists(path);
+    maybeRaiseIOE("listStatus", path, listToFail);
+    return wrappedOperations.listStatusIterator(path);
+  }
+
+  @Override
+  public TaskManifest loadTaskManifest(JsonSerialization<TaskManifest> serializer,
+      final FileStatus st) throws IOException {
+    verifyExists(st.getPath());
+    return wrappedOperations.loadTaskManifest(serializer, st);
+  }
+
+  @Override
+  public <T extends AbstractManifestData<T>> void save(T manifestData,
+      final Path path,
+      final boolean overwrite) throws IOException {
+    maybeRaiseIOE("save", path, saveToFail);
+    wrappedOperations.save(manifestData, path, overwrite);
+  }
+
+  @Override
+  public void msync(Path path) throws IOException {
+    wrappedOperations.msync(path);
+  }
+
+  @Override
+  public String getEtag(FileStatus status) {
+    return wrappedOperations.getEtag(status);
+  }
+
+  @Override
+  public boolean storeSupportsResilientCommit() {
+    return wrappedOperations.storeSupportsResilientCommit();
+  }
+
+  @Override
+  public CommitFileResult commitFile(final FileEntry entry)
+      throws IOException {
+    if (renameToFailWithException) {
+      maybeRaiseIOE("commitFile",
+          entry.getSourcePath(), renameSourceFilesToFail);
+      maybeRaiseIOE("commitFile",
+          entry.getDestPath().getParent(), renameDestDirsToFail);
+    }
+    return wrappedOperations.commitFile(entry);
+  }
+
+  @Override
+  public boolean storePreservesEtagsThroughRenames(Path path) {
+    return wrappedOperations.storePreservesEtagsThroughRenames(path);
+  }
+
+  @Override
+  public void close() throws IOException {
+    wrappedOperations.close();
+  }
+
+}
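A hedged sketch of the failure-injection pattern the rename tests earlier in this patch build on; realOperations and entry stand in for whatever ManifestStoreOperations instance and FileEntry the test has created.

    UnreliableManifestStoreOperations failures =
        new UnreliableManifestStoreOperations(realOperations);
    failures.addRenameSourceFilesToFail(entry.getSourcePath());
    failures.setRenameToFailWithException(true);
    // renameFile()/commitFile() of that source now raises a PathIOException
    // containing SIMULATED_FAILURE; with the flag set to false, renameFile()
    // instead returns false and the calling stage must escalate that itself.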

+ 29 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/package-info.java

@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * These test suites are subclassed in internal modules (hadoop-azure) and
+ * externally for GCS integration testing.
+ * Test with both before making changes.
+ */
+@InterfaceAudience.LimitedPrivate("integration tests")
+@InterfaceStability.Unstable
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;

+ 138 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/contract/localfs.xml

@@ -0,0 +1,138 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one
+  ~  or more contributor license agreements.  See the NOTICE file
+  ~  distributed with this work for additional information
+  ~  regarding copyright ownership.  The ASF licenses this file
+  ~  to you under the Apache License, Version 2.0 (the
+  ~  "License"); you may not use this file except in compliance
+  ~  with the License.  You may obtain a copy of the License at
+  ~
+  ~       http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~  Unless required by applicable law or agreed to in writing, software
+  ~  distributed under the License is distributed on an "AS IS" BASIS,
+  ~  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~  See the License for the specific language governing permissions and
+  ~  limitations under the License.
+  -->
+
+<configuration>
+
+<!--
+This is a copy of the file
+hadoop-common-project/hadoop-common/src/test/resources/contract/localfs.xml ;
+it is needed for all contract tests which test against the local filesystem.
+
+case sensitivity and permission options are determined at run time from OS type.
+-->
+
+  <property>
+    <name>fs.contract.is-case-sensitive</name>
+    <value>true</value>
+  </property>
+
+
+  <property>
+    <name>fs.contract.supports-unix-permissions</name>
+    <value>true</value>
+  </property>
+
+  <!--
+  The remaining options are static
+  -->
+
+  <property>
+    <name>fs.contract.test.root-tests-enabled</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.contract.test.random-seek-count</name>
+    <value>1000</value>
+  </property>
+
+  <property>
+    <name>fs.contract.rename-creates-dest-dirs</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.rename-overwrites-dest</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.rename-remove-dest-if-empty-dir</name>
+    <value>true</value>
+  </property>
+
+  <!--
+  checksummed filesystems do not support append; see HADOOP-4292
+  -->
+  <property>
+    <name>fs.contract.supports-append</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-atomic-directory-delete</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-atomic-rename</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-block-locality</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-concat</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-seek</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-seek-on-closed-file</name>
+    <value>true</value>
+  </property>
+
+  <!-- checksum FS doesn't allow seeking past EOF -->
+  <property>
+    <name>fs.contract.rejects-seek-past-eof</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-strict-exceptions</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-settimes</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-getfilestatus</name>
+    <value>true</value>
+  </property>
+
+</configuration>
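A hedged sketch of how such contract options are consumed: the contract test framework loads this resource and queries feature keys under the fs.contract. prefix, roughly equivalent to the plain Configuration lookup below.

    Configuration contractConf = new Configuration(false);
    contractConf.addResource("contract/localfs.xml");
    boolean renameOverwrites =
        contractConf.getBoolean("fs.contract.rename-overwrites-dest", false);
    // tests skip or tighten their assertions depending on such flags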

+ 33 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/core-site.xml

@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one
+  ~  or more contributor license agreements.  See the NOTICE file
+  ~  distributed with this work for additional information
+  ~  regarding copyright ownership.  The ASF licenses this file
+  ~  to you under the Apache License, Version 2.0 (the
+  ~  "License"); you may not use this file except in compliance
+  ~  with the License.  You may obtain a copy of the License at
+  ~
+  ~       http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~  Unless required by applicable law or agreed to in writing, software
+  ~  distributed under the License is distributed on an "AS IS" BASIS,
+  ~  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~  See the License for the specific language governing permissions and
+  ~  limitations under the License.
+  -->
+
+<!-- Values used when running unit tests.  Specify any values in here that
+     should override the default values. -->
+
+<configuration>
+
+  <property>
+    <name>hadoop.tmp.dir</name>
+    <value>target/build/test</value>
+    <description>A base for other temporary directories.</description>
+    <final>false</final>
+  </property>
+
+</configuration>