
MAPREDUCE-7341. Add an intermediate manifest committer for Azure and GCS

This is a mapreduce/spark output committer optimized for
performance and correctness on Azure ADLS Gen 2 storage
(via the abfs connector) and Google Cloud Storage
(via the external gcs connector library).

* It is safe to use with HDFS; however, it has not been optimized
  for that use.
* It is *not* safe for use with S3, and will fail if an attempt
is made to do so.

Contributed by Steve Loughran

Change-Id: I6f3502e79c578b9fd1a8c1485f826784b5421fca
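
As a quick orientation (not part of the change itself), a job can opt in to the new committer through the path output committer factory mechanism. A minimal, hedged sketch follows; the global factory key name is an assumption based on PathOutputCommitterFactory, while ManifestCommitterFactory is the factory class added by this change:

    import org.apache.hadoop.conf.Configuration;

    public class EnableManifestCommitterSketch {
      public static void main(String[] args) {
        // Hedged sketch: point the committer factory at the manifest committer.
        // The key name "mapreduce.outputcommitter.factory.class" is an assumption
        // about PathOutputCommitterFactory's global binding key.
        Configuration conf = new Configuration();
        conf.set("mapreduce.outputcommitter.factory.class",
            "org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory");
        System.out.println(conf.get("mapreduce.outputcommitter.factory.class"));
      }
    }

In practice the binding is more likely to be made per filesystem scheme (abfs, gcs), as sketched after the PathOutputCommitterFactory diff below, so other destinations keep their existing committers.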
Steve Loughran
commit 7328c34ba5
65 changed files with 16579 additions and 6 deletions
  1. 6 1
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
  2. 5 5
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/PathOutputCommitterFactory.java
  3. 761 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitter.java
  4. 375 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConfig.java
  5. 240 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConstants.java
  6. 57 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterFactory.java
  7. 264 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterStatisticNames.java
  8. 137 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/AbstractManifestData.java
  9. 44 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/DiagnosticKeys.java
  10. 202 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/DirEntry.java
  11. 87 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/EntryStatus.java
  12. 189 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/FileEntry.java
  13. 144 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestPrinter.java
  14. 493 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestSuccessData.java
  15. 365 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/TaskManifest.java
  16. 41 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/package-info.java
  17. 94 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/AuditingIntegration.java
  18. 130 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/InternalConstants.java
  19. 374 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestCommitterSupport.java
  20. 291 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperations.java
  21. 187 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java
  22. 40 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/OutputValidationException.java
  23. 30 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/package-info.java
  24. 34 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/package-info.java
  25. 63 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbortTaskStage.java
  26. 942 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbstractJobOrTaskStage.java
  27. 511 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java
  28. 245 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitJobStage.java
  29. 115 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitTaskStage.java
  30. 423 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java
  31. 33 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/JobOrTaskStage.java
  32. 293 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/LoadManifestsStage.java
  33. 173 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/RenameFilesStage.java
  34. 79 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveSuccessFileStage.java
  35. 81 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java
  36. 63 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupJobStage.java
  37. 54 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupTaskStage.java
  38. 556 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java
  39. 37 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageEventCallbacks.java
  40. 214 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/TaskAttemptScanDirectoryStage.java
  41. 200 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/ValidateRenamedFilesStage.java
  42. 30 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/package-info.java
  43. 19 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml
  44. 605 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md
  45. 335 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_architecture.md
  46. 617 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_protocol.md
  47. 1085 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/AbstractManifestCommitterTest.java
  48. 152 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/CommitterTestBinding.java
  49. 420 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterTestSupport.java
  50. 109 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/StubStoreOperations.java
  51. 54 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TaggedFileStatus.java
  52. 142 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCleanupStage.java
  53. 126 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCommitTaskStage.java
  54. 307 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCreateOutputDirectoriesStage.java
  55. 601 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestJobThroughManifestCommitter.java
  56. 141 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestLoadManifestsStage.java
  57. 1801 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestManifestCommitProtocol.java
  58. 379 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestRenameStageFailure.java
  59. 185 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestTaskManifestFileIO.java
  60. 136 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TextOutputForTests.java
  61. 83 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ThreadLeakTracker.java
  62. 380 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/UnreliableManifestStoreOperations.java
  63. 29 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/package-info.java
  64. 138 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/contract/localfs.xml
  65. 33 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/core-site.xml

+ 6 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml

@@ -103,9 +103,14 @@
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
-	<configuration>
+        <configuration>
          <systemPropertyVariables>
            <log4j.configuration>file:///${project.basedir}/src/test/resources/log4j.properties</log4j.configuration>
+
+            <!-- project directory   (target/) on a maven build. -->
+            <project.build.directory>${project.build.directory}</project.build.directory>
+            <!-- this property is used in hadoop-azure with separate paths for parallel runs -->
+            <test.build.dir>${project.build.directory}</test.build.dir>
          </systemPropertyVariables>
        </configuration>
      </plugin>

+ 5 - 5
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/PathOutputCommitterFactory.java

@@ -52,8 +52,8 @@ import org.apache.hadoop.util.ReflectionUtils;
 * attempt context and a possibly null path.
 *
 */
-@InterfaceAudience.Private
-@InterfaceStability.Unstable
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
 public class PathOutputCommitterFactory extends Configured {
   private static final Logger LOG =
       LoggerFactory.getLogger(PathOutputCommitterFactory.class);
@@ -158,7 +158,7 @@ public class PathOutputCommitterFactory extends Configured {
          scheme);
      if (StringUtils.isNotEmpty(conf.getTrimmed(schemeKey))) {
        // it does, so use that key in the classname lookup
-        LOG.debug("Using schema-specific factory for {}", outputPath);
+        LOG.info("Using schema-specific factory for {}", outputPath);
        key = schemeKey;
      } else {
        LOG.debug("No scheme-specific factory defined in {}", schemeKey);
@@ -171,7 +171,7 @@ public class PathOutputCommitterFactory extends Configured {
    String trimmedValue = conf.getTrimmed(key, "");
    if (StringUtils.isEmpty(trimmedValue)) {
      // empty/null value, use default
-      LOG.debug("No output committer factory defined,"
+      LOG.info("No output committer factory defined,"
          + " defaulting to FileOutputCommitterFactory");
      factory = FileOutputCommitterFactory.class;
    } else {
@@ -179,7 +179,7 @@ public class PathOutputCommitterFactory extends Configured {
      factory = conf.getClass(key,
          FileOutputCommitterFactory.class,
          PathOutputCommitterFactory.class);
-      LOG.debug("Using OutputCommitter factory class {} from key {}",
+      LOG.info("Using OutputCommitter factory class {} from key {}",
          factory, key);
    }
    return ReflectionUtils.newInstance(factory, conf);
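
The log-level changes above sit on the scheme-specific lookup path. As a hedged illustration of that binding (the per-scheme key prefix is an assumption inferred from the schemeKey lookup in the code above, not something this diff spells out):

    import org.apache.hadoop.conf.Configuration;

    public class SchemeSpecificFactorySketch {
      public static void main(String[] args) {
        // Hedged sketch: bind a committer factory for abfs destinations only,
        // the case reported by the "schema-specific factory" log line above.
        // The key prefix "mapreduce.outputcommitter.factory.scheme." is an assumption.
        Configuration conf = new Configuration();
        conf.set("mapreduce.outputcommitter.factory.scheme.abfs",
            "org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory");
        System.out.println(conf.get("mapreduce.outputcommitter.factory.scheme.abfs"));
      }
    }

Destinations on other schemes fall through to FileOutputCommitterFactory, matching the default branch shown above.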

+ 761 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitter.java

@@ -0,0 +1,761 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.classification.VisibleForTesting;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobStatus;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperationsThroughFileSystem;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.AbortTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageEventCallbacks;
+import org.apache.hadoop.util.functional.CloseableTaskPoolSubmitter;
+
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.logIOStatisticsAtDebug;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_DIAGNOSTICS_MANIFEST_DIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_SUMMARY_REPORT_DIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASKS_COMPLETED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASKS_FAILED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_COMMIT_FILE_RENAME_RECOVERED;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_ABORT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.STAGE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration.updateCommonContextOnCommitterExit;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration.updateCommonContextOnCommitterEntry;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createIOStatisticsStore;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createJobSummaryFilename;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createManifestOutcome;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestPathForTask;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage.cleanupStageOptionsFromConfig;
+
+/**
+ * This is the Intermediate-Manifest committer.
+ * At every entry point it updates the thread's audit context with
+ * the current stage info; this is a placeholder for
+ * adding audit information to stores other than S3A.
+ *
+ * This is tagged as public/stable. This is mandatory
+ * for the classname and PathOutputCommitter implementation
+ * classes.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
+public class ManifestCommitter extends PathOutputCommitter implements
+    IOStatisticsSource, StageEventCallbacks {
+
+  public static final Logger LOG = LoggerFactory.getLogger(
+      ManifestCommitter.class);
+
+  /**
+   * Role: task committer.
+   */
+  public static final String TASK_COMMITTER = "task committer";
+
+  /**
+   * Role: job committer.
+   */
+  public static final String JOB_COMMITTER = "job committer";
+
+  /**
+   * Committer Configuration as extracted from
+   * the job/task context and set in the constructor.
+   */
+  private final ManifestCommitterConfig baseConfig;
+
+  /**
+   * Destination of the job.
+   */
+  private final Path destinationDir;
+
+  /**
+   * For tasks, the attempt directory.
+   * Null for jobs.
+   */
+  private final Path taskAttemptDir;
+
+  /**
+   * IOStatistics to update.
+   */
+  private final IOStatisticsStore iostatistics;
+
+  /**
+   *  The job Manifest Success data; only valid after a job successfully
+   *  commits.
+   */
+  private ManifestSuccessData successReport;
+
+  /**
+   * The active stage; is updated by a callback from within the stages.
+   */
+  private String activeStage;
+
+  /**
+   * The task manifest of the task commit.
+   * Null unless this is a task attempt and the
+   * task has successfully been committed.
+   */
+  private TaskManifest taskAttemptCommittedManifest;
+
+  /**
+   * Create a committer.
+   * @param outputPath output path
+   * @param context job/task context
+   * @throws IOException failure.
+   */
+  public ManifestCommitter(final Path outputPath,
+      final TaskAttemptContext context) throws IOException {
+    super(outputPath, context);
+    this.destinationDir = resolveDestinationDirectory(outputPath,
+        context.getConfiguration());
+    this.iostatistics = createIOStatisticsStore().build();
+    this.baseConfig = enterCommitter(
+        context.getTaskAttemptID() != null,
+        context);
+
+    this.taskAttemptDir = baseConfig.getTaskAttemptDir();
+    LOG.info("Created ManifestCommitter with JobID {},"
+            + " Task Attempt {} and destination {}",
+        context.getJobID(), context.getTaskAttemptID(), outputPath);
+  }
+
+  /**
+   * Called on entry to a committer method; generates a config for it.
+   * Calls {@code #updateCommonContextOnCommitterEntry()}
+   * to update the audit context.
+   * @param isTask is this a task entry point?
+   * @param context context
+   * @return committer config
+   */
+  private ManifestCommitterConfig enterCommitter(boolean isTask,
+      JobContext context) {
+    ManifestCommitterConfig committerConfig =
+        new ManifestCommitterConfig(
+            getOutputPath(),
+            isTask ? TASK_COMMITTER : JOB_COMMITTER,
+            context,
+            iostatistics,
+            this);
+    updateCommonContextOnCommitterEntry(committerConfig);
+    return committerConfig;
+  }
+
+  /**
+   * Set up a job through a {@link SetupJobStage}.
+   * @param jobContext Context of the job whose output is being written.
+   * @throws IOException IO Failure.
+   */
+  @Override
+  public void setupJob(final JobContext jobContext) throws IOException {
+    ManifestCommitterConfig committerConfig = enterCommitter(false,
+        jobContext);
+    StageConfig stageConfig =
+        committerConfig
+            .createStageConfig()
+            .withOperations(createManifestStoreOperations())
+            .build();
+    // set up the job.
+    new SetupJobStage(stageConfig)
+        .apply(committerConfig.getCreateJobMarker());
+    logCommitterStatisticsAtDebug();
+  }
+
+  /**
+   * Set up a task through a {@link SetupTaskStage}.
+   * Classic FileOutputCommitter is a no-op here, relying
+   * on RecordWriters to create the dir implicitly on file
+   * create().
+   * FileOutputCommitter also uses the existence of that
+   * file as a flag to indicate task commit is needed.
+   * @param context task context.
+   * @throws IOException IO Failure.
+   */
+  @Override
+  public void setupTask(final TaskAttemptContext context)
+      throws IOException {
+    ManifestCommitterConfig committerConfig =
+        enterCommitter(true, context);
+    StageConfig stageConfig =
+        committerConfig
+            .createStageConfig()
+            .withOperations(createManifestStoreOperations())
+            .build();
+    // create task attempt dir; delete if present. Or fail?
+    new SetupTaskStage(stageConfig).apply("");
+    logCommitterStatisticsAtDebug();
+  }
+
+  /**
+   * Always return true.
+   * This way, even if there is no output, stats are collected.
+   * @param context task context.
+   * @return true
+   * @throws IOException IO Failure.
+   */
+  @Override
+  public boolean needsTaskCommit(final TaskAttemptContext context)
+      throws IOException {
+    LOG.info("Probe for needsTaskCommit({})",
+        context.getTaskAttemptID());
+    return true;
+  }
+
+  /**
+   * A failure during job commit cannot be recovered from.
+   *
+   * @param jobContext
+   *          Context of the job whose output is being written.
+   * @return false, always
+   * @throws IOException never
+   */
+  @Override
+  public boolean isCommitJobRepeatable(final JobContext jobContext)
+      throws IOException {
+    LOG.info("Probe for isCommitJobRepeatable({}): returning false",
+        jobContext.getJobID());
+    return false;
+  }
+
+  /**
+   * Declare that task recovery is not supported.
+   * It would be, if someone added the code *and tests*.
+   * @param jobContext
+   *          Context of the job whose output is being written.
+   * @return false, always
+   * @throws IOException never
+   */
+  @Override
+  public boolean isRecoverySupported(final JobContext jobContext)
+      throws IOException {
+    LOG.info("Probe for isRecoverySupported({}): returning false",
+        jobContext.getJobID());
+    return false;
+  }
+
+  /**
+   * Task recovery is not supported; this always throws an IOException.
+   * @param taskContext Context of the task whose output is being recovered
+   * @throws IOException always
+   */
+  @Override
+  public void recoverTask(final TaskAttemptContext taskContext)
+      throws IOException {
+    LOG.warn("Rejecting recoverTask({}) call", taskContext.getTaskAttemptID());
+    throw new IOException("Cannot recover task "
+        + taskContext.getTaskAttemptID());
+  }
+
+  /**
+   * Commit the task.
+   * This is where the listing of the task attempt directory tree takes place.
+   * @param context task context.
+   * @throws IOException IO Failure.
+   */
+  @Override
+  public void commitTask(final TaskAttemptContext context)
+      throws IOException {
+    ManifestCommitterConfig committerConfig = enterCommitter(true,
+        context);
+    try {
+      StageConfig stageConfig = committerConfig.createStageConfig()
+          .withOperations(createManifestStoreOperations())
+          .build();
+      taskAttemptCommittedManifest = new CommitTaskStage(stageConfig)
+          .apply(null).getTaskManifest();
+      iostatistics.incrementCounter(COMMITTER_TASKS_COMPLETED_COUNT, 1);
+    } catch (IOException e) {
+      iostatistics.incrementCounter(COMMITTER_TASKS_FAILED_COUNT, 1);
+      throw e;
+    } finally {
+      logCommitterStatisticsAtDebug();
+      updateCommonContextOnCommitterExit();
+    }
+
+  }
+
+  /**
+   * Abort a task.
+   * @param context task context
+   * @throws IOException failure during the delete
+   */
+  @Override
+  public void abortTask(final TaskAttemptContext context)
+      throws IOException {
+    ManifestCommitterConfig committerConfig = enterCommitter(true,
+        context);
+    try {
+      new AbortTaskStage(
+          committerConfig.createStageConfig()
+              .withOperations(createManifestStoreOperations())
+              .build())
+          .apply(false);
+    } finally {
+      logCommitterStatisticsAtDebug();
+      updateCommonContextOnCommitterExit();
+    }
+  }
+
+  /**
+   * Get the manifest success data for this job; creating on demand if needed.
+   * @param committerConfig source config.
+   * @return the current {@link #successReport} value; never null.
+   */
+  private ManifestSuccessData getOrCreateSuccessData(
+      ManifestCommitterConfig committerConfig) {
+    if (successReport == null) {
+      successReport = createManifestOutcome(
+          committerConfig.createStageConfig(), activeStage);
+    }
+    return successReport;
+  }
+
+  /**
+   * This is the big job commit stage.
+   * Load the manifests, prepare the destination, rename
+   * the files then cleanup the job directory.
+   * @param jobContext Context of the job whose output is being written.
+   * @throws IOException failure.
+   */
+  @Override
+  public void commitJob(final JobContext jobContext) throws IOException {
+
+    ManifestCommitterConfig committerConfig = enterCommitter(false, jobContext);
+
+    // create the initial success data.
+    // this is overwritten by that created during the operation sequence,
+    // but if the sequence fails before that happens, it
+    // will be saved to the report directory.
+    ManifestSuccessData marker = getOrCreateSuccessData(committerConfig);
+    IOException failure = null;
+    try (CloseableTaskPoolSubmitter ioProcs =
+             committerConfig.createSubmitter();
+         ManifestStoreOperations storeOperations = createManifestStoreOperations()) {
+      // the stage config will be shared across all stages.
+      StageConfig stageConfig = committerConfig.createStageConfig()
+          .withOperations(storeOperations)
+          .withIOProcessors(ioProcs)
+          .build();
+
+      // commit the job, including any cleanup and validation.
+      final Configuration conf = jobContext.getConfiguration();
+      CommitJobStage.Result result = new CommitJobStage(stageConfig).apply(
+          new CommitJobStage.Arguments(
+              committerConfig.getCreateJobMarker(),
+              committerConfig.getValidateOutput(),
+              conf.getTrimmed(OPT_DIAGNOSTICS_MANIFEST_DIR, ""),
+              cleanupStageOptionsFromConfig(
+                  OP_STAGE_JOB_CLEANUP, conf)
+          ));
+      marker = result.getJobSuccessData();
+      // update the cached success with the new report.
+      setSuccessReport(marker);
+
+    } catch (IOException e) {
+      // failure. record it for the summary
+      failure = e;
+      // rethrow
+      throw e;
+    } finally {
+      // save the report summary, even on failure
+      maybeSaveSummary(activeStage,
+          committerConfig,
+          marker,
+          failure,
+          true,
+          true);
+      // print job commit stats
+      LOG.info("{}: Job Commit statistics {}",
+          committerConfig.getName(),
+          ioStatisticsToPrettyString(iostatistics));
+      // and warn of rename problems
+      final Long recoveries = iostatistics.counters().get(OP_COMMIT_FILE_RENAME_RECOVERED);
+      if (recoveries != null && recoveries > 0) {
+        LOG.warn("{}: rename failures were recovered from. Number of recoveries: {}",
+            committerConfig.getName(), recoveries);
+      }
+      updateCommonContextOnCommitterExit();
+    }
+  }
+
+  /**
+   * Abort the job.
+   * Invokes
+   * {@link #executeCleanup(String, JobContext, ManifestCommitterConfig)}
+   * then saves the (ongoing) job report data if reporting is enabled.
+   * @param jobContext Context of the job whose output is being written.
+   * @param state final runstate of the job
+   * @throws IOException failure during cleanup; report failures are swallowed
+   */
+  @Override
+  public void abortJob(final JobContext jobContext,
+      final JobStatus.State state)
+      throws IOException {
+    LOG.info("Aborting Job {} in state {}", jobContext.getJobID(), state);
+    ManifestCommitterConfig committerConfig = enterCommitter(false,
+        jobContext);
+    ManifestSuccessData report = getOrCreateSuccessData(
+        committerConfig);
+    IOException failure = null;
+
+    try {
+      executeCleanup(OP_STAGE_JOB_ABORT, jobContext, committerConfig);
+    } catch (IOException e) {
+      // failure.
+      failure = e;
+    }
+    report.setSuccess(false);
+    // job abort does not overwrite any existing report, so a job commit
+    // failure cause will be preserved.
+    maybeSaveSummary(activeStage, committerConfig, report, failure,
+        true, false);
+    // print job stats
+    LOG.info("Job Abort statistics {}",
+        ioStatisticsToPrettyString(iostatistics));
+    updateCommonContextOnCommitterExit();
+  }
+
+  /**
+   * Execute the {@code CleanupJobStage} to remove the job attempt dir.
+   * @param jobContext Context of the job whose output is being written.
+   * @throws IOException failure during cleanup
+   */
+  @SuppressWarnings("deprecation")
+  @Override
+  public void cleanupJob(final JobContext jobContext) throws IOException {
+    ManifestCommitterConfig committerConfig = enterCommitter(false,
+        jobContext);
+    try {
+      executeCleanup(OP_STAGE_JOB_CLEANUP, jobContext, committerConfig);
+    } finally {
+      logCommitterStatisticsAtDebug();
+      updateCommonContextOnCommitterExit();
+    }
+  }
+
+  /**
+   * Perform the cleanup operation for job cleanup or abort.
+   * @param statisticName statistic/stage name
+   * @param jobContext job context
+   * @param committerConfig committer config
+   * @throws IOException failure
+   * @return the outcome
+   */
+  private CleanupJobStage.Result executeCleanup(
+      final String statisticName,
+      final JobContext jobContext,
+      final ManifestCommitterConfig committerConfig) throws IOException {
+    try (CloseableTaskPoolSubmitter ioProcs =
+             committerConfig.createSubmitter()) {
+
+      return new CleanupJobStage(
+          committerConfig.createStageConfig()
+              .withOperations(createManifestStoreOperations())
+              .withIOProcessors(ioProcs)
+              .build())
+          .apply(cleanupStageOptionsFromConfig(
+              statisticName,
+              jobContext.getConfiguration()));
+    }
+  }
+
+  /**
+   * Output path: destination directory of the job.
+   * @return the overall job destination directory.
+   */
+  @Override
+  public Path getOutputPath() {
+    return getDestinationDir();
+  }
+
+  /**
+   * Work path of the current task attempt.
+   * This is null if the task does not have one.
+   * @return a path.
+   */
+  @Override
+  public Path getWorkPath() {
+    return getTaskAttemptDir();
+  }
+
+  /**
+   * Get the job destination dir.
+   * @return dest dir.
+   */
+  private Path getDestinationDir() {
+    return destinationDir;
+  }
+
+  /**
+   * Get the task attempt dir.
+   * May be null.
+   * @return a path or null.
+   */
+  private Path getTaskAttemptDir() {
+    return taskAttemptDir;
+  }
+
+  /**
+   * Callback on stage entry.
+   * Sets {@link #activeStage} and updates the
+   * common context.
+   * @param stage new stage
+   */
+  @Override
+  public void enterStage(String stage) {
+    activeStage = stage;
+    AuditingIntegration.enterStage(stage);
+  }
+
+  /**
+   * Remove stage from common audit context.
+   * @param stage stage exited.
+   */
+  @Override
+  public void exitStage(String stage) {
+    AuditingIntegration.exitStage();
+  }
+
+  /**
+   * Get the unique ID of this job.
+   * @return job ID (yarn, spark)
+   */
+  public String getJobUniqueId() {
+    return baseConfig.getJobUniqueId();
+  }
+
+  /**
+   * Get the config of the task attempt this instance was constructed
+   * with.
+   * @return a configuration.
+   */
+  public Configuration getConf() {
+    return baseConfig.getConf();
+  }
+
+  /**
+   * Get the manifest Success data; only valid after a job.
+   * @return the job _SUCCESS data, or null.
+   */
+  public ManifestSuccessData getSuccessReport() {
+    return successReport;
+  }
+
+  private void setSuccessReport(ManifestSuccessData successReport) {
+    this.successReport = successReport;
+  }
+
+  /**
+   * Get the manifest of the last committed task.
+   * @return a task manifest or null.
+   */
+  @VisibleForTesting
+  TaskManifest getTaskAttemptCommittedManifest() {
+    return taskAttemptCommittedManifest;
+  }
+
+  /**
+   * Compute the path where the output of a task attempt is stored until
+   * that task is committed.
+   * @param context the context of the task attempt.
+   * @return the path where a task attempt should be stored.
+   */
+  @VisibleForTesting
+  public Path getTaskAttemptPath(TaskAttemptContext context) {
+    return enterCommitter(false, context).getTaskAttemptDir();
+  }
+
+  /**
+   * The path to where the manifest file of a task attempt will be
+   * saved when the task is committed.
+   * This path will be the same for all attempts of the same task.
+   * @param context the context of the task attempt.
+   * @return the path where a task attempt should be stored.
+   */
+  @VisibleForTesting
+  public Path getTaskManifestPath(TaskAttemptContext context) {
+    final Path dir = enterCommitter(false, context).getTaskManifestDir();
+
+    return manifestPathForTask(dir,
+        context.getTaskAttemptID().getTaskID().toString());
+  }
+
+  /**
+   * Compute the path where the data of a job attempt is stored until
+   * the job is committed.
+   * @param context the context of the job.
+   * @return the path of the job attempt directory.
+   */
+  @VisibleForTesting
+  public Path getJobAttemptPath(JobContext context) {
+
+    return enterCommitter(false, context).getJobAttemptDir();
+  }
+
+  /**
+   * Get the final output path, including resolving any relative path.
+   * @param outputPath output path
+   * @param conf configuration to create any FS with
+   * @return a resolved path.
+   * @throws IOException failure.
+   */
+  private Path resolveDestinationDirectory(Path outputPath,
+      Configuration conf) throws IOException {
+    return FileSystem.get(outputPath.toUri(), conf).makeQualified(outputPath);
+  }
+
+  /**
+   * Create manifest store operations for the destination store.
+   * This MUST NOT be used for the success report operations, as
+   * they may be to a different filesystem.
+   * This is a point which can be overridden during testing.
+   * @return a new store operations instance bonded to the destination fs.
+   * @throws IOException failure to instantiate.
+   */
+  protected ManifestStoreOperations createManifestStoreOperations() throws IOException {
+    return ManifestCommitterSupport.createManifestStoreOperations(
+        baseConfig.getConf(),
+        baseConfig.getDestinationFileSystem(),
+        baseConfig.getDestinationDir());
+  }
+
+  /**
+   * Log IO Statistics at debug.
+   */
+  private void logCommitterStatisticsAtDebug() {
+    logIOStatisticsAtDebug(LOG, "Committer Statistics", this);
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder(
+        "ManifestCommitter{");
+    sb.append(baseConfig);
+    sb.append(", iostatistics=").append(ioStatisticsToPrettyString(iostatistics));
+    sb.append('}');
+    return sb.toString();
+  }
+
+  /**
+   * Save a summary to the report dir if the config option
+   * is set.
+   * The IOStatistics of the summary will be updated to the latest
+   * snapshot of the committer's statistics, so the report is up
+   * to date.
+   * The report will be updated with the current active stage,
+   * and if {@code thrown} is non-null, it will be added to the
+   * diagnostics (and the job tagged as a failure).
+   * Static for testability.
+   * @param activeStage active stage
+   * @param config configuration to use.
+   * @param report summary file.
+   * @param thrown any exception indicating failure.
+   * @param quiet should exceptions be swallowed.
+   * @param overwrite should the existing file be overwritten
+   * @return the path of a file, if successfully saved
+   * @throws IOException if a failure occurred and quiet==false
+   */
+  private static Path maybeSaveSummary(
+      String activeStage,
+      ManifestCommitterConfig config,
+      ManifestSuccessData report,
+      Throwable thrown,
+      boolean quiet,
+      boolean overwrite) throws IOException {
+    Configuration conf = config.getConf();
+    String reportDir = conf.getTrimmed(OPT_SUMMARY_REPORT_DIR, "");
+    if (reportDir.isEmpty()) {
+      LOG.debug("No summary directory set in " + OPT_SUMMARY_REPORT_DIR);
+      return null;
+    }
+    LOG.debug("Summary directory set in to {}" + OPT_SUMMARY_REPORT_DIR,
+        reportDir);
+
+    // update to the latest statistics
+    report.snapshotIOStatistics(config.getIOStatistics());
+
+    Path reportDirPath = new Path(reportDir);
+    Path path = new Path(reportDirPath,
+        createJobSummaryFilename(config.getJobUniqueId()));
+
+    if (thrown != null) {
+      report.recordJobFailure(thrown);
+    }
+    report.putDiagnostic(STAGE, activeStage);
+    // the store operations here is explicitly created for the FS where
+    // the reports go, which may not be the target FS of the job.
+
+    final FileSystem fs = path.getFileSystem(conf);
+    try (ManifestStoreOperations operations = new ManifestStoreOperationsThroughFileSystem(fs)) {
+      if (!overwrite) {
+        // check for file existence so there is no need to worry about
+        // precisely what exception is raised when overwrite=false and dest file
+        // exists
+        try {
+          FileStatus st = operations.getFileStatus(path);
+          // get here and the file exists
+          LOG.debug("Report already exists: {}", st);
+          return null;
+        } catch (FileNotFoundException ignored) {
+        }
+      }
+      operations.save(report, path, overwrite);
+      LOG.info("Job summary saved to {}", path);
+      return path;
+    } catch (IOException e) {
+      LOG.debug("Failed to save summary to {}", path, e);
+      if (quiet) {
+        return null;
+      } else {
+        throw e;
+      }
+    }
+  }
+
+  @Override
+  public IOStatisticsStore getIOStatistics() {
+    return iostatistics;
+  }
+}
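
Before the next file, a hedged, test-style sketch of the lifecycle this class implements (setupJob, setupTask, commitTask, commitJob). The TaskAttemptContextImpl construction, the attempt ID string and the destination path are illustrative assumptions; a real MR or Spark job supplies these contexts itself:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.TaskAttemptID;
    import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitter;

    public class ManifestCommitterLifecycleSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumption: a local path just for demonstration; abfs/gcs are the real targets.
        Path dest = new Path("file:///tmp/manifest-committer-demo");

        // Hypothetical task attempt; any parseable attempt ID string works here.
        TaskAttemptID attempt =
            TaskAttemptID.forName("attempt_200707121733_0001_m_000000_0");
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(conf, attempt);

        ManifestCommitter committer = new ManifestCommitter(dest, taskContext);
        committer.setupJob(taskContext);      // create job attempt and manifest dirs
        committer.setupTask(taskContext);     // create the task attempt dir
        // ... record writers would create files under committer.getWorkPath() ...
        if (committer.needsTaskCommit(taskContext)) {
          committer.commitTask(taskContext);  // scan the attempt dir, save a manifest
        }
        committer.commitJob(taskContext);     // load manifests, rename files, write _SUCCESS
      }
    }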

+ 375 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConfig.java

@@ -0,0 +1,375 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+import java.util.Objects;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageEventCallbacks;
+import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;
+import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.concurrent.HadoopExecutors;
+import org.apache.hadoop.util.functional.CloseableTaskPoolSubmitter;
+
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.SUCCESSFUL_JOB_OUTPUT_DIR_MARKER;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.*;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.buildJobUUID;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.getAppAttemptId;
+
+/**
+ * The configuration for the committer as built up from the job configuration
+ * and data passed down from the committer factory.
+ * Isolated for ease of development and testing.
+ */
+public final class ManifestCommitterConfig implements IOStatisticsSource {
+
+  /**
+   * Final destination of work.
+   * This is <i>unqualified</i>.
+   */
+  private final Path destinationDir;
+
+  /**
+   * Role: used in log/text messages.
+   */
+  private final String role;
+
+  /**
+   * This is the directory for all intermediate work: where the output
+   * format will write data.
+   * Will be null if built from a job context.
+   */
+  private final Path taskAttemptDir;
+
+  /** Configuration of the job. */
+  private final Configuration conf;
+
+  /** The job context. For a task, this can be cast to a TaskContext. */
+  private final JobContext jobContext;
+
+  /** Should a job marker be created? */
+  private final boolean createJobMarker;
+
+  /**
+   * Job ID or UUID, without any attempt suffix.
+   * This is expected/required to be unique, though
+   * Spark has had "issues" there until recently
+   * with lack of uniqueness of generated MR Job IDs.
+   */
+  private final String jobUniqueId;
+
+  /**
+   * Where did the job Unique ID come from?
+   */
+  private final String jobUniqueIdSource;
+
+  /**
+   * Number of this attempt; starts at zero.
+   */
+  private final int jobAttemptNumber;
+
+  /**
+   * Job ID + AttemptID.
+   */
+  private final String jobAttemptId;
+
+  /**
+   * Task ID: used as the filename of the manifest.
+   * Will be "" if built from a job context.
+   */
+  private final String taskId;
+
+  /**
+   * Task attempt ID. Determines the working
+   * directory for task attempts to write data into,
+   * and for the task committer to scan.
+   * Will be "" if built from a job context.
+   */
+  private final String taskAttemptId;
+
+  /** Any progressable for progress callbacks. */
+  private final Progressable progressable;
+
+  /**
+   * IOStatistics to update.
+   */
+  private final IOStatisticsStore iostatistics;
+
+
+  /** Should the output be validated after the commit? */
+  private final boolean validateOutput;
+
+  /**
+   * Attempt directory management.
+   */
+  private final ManifestCommitterSupport.AttemptDirectories dirs;
+
+  /**
+   * Callback when a stage is entered.
+   */
+  private final StageEventCallbacks stageEventCallbacks;
+
+  /**
+   * Name for logging.
+   */
+  private final String name;
+
+  /**
+   * Delete target paths on commit? Stricter, but
+   * higher IO cost.
+   */
+  private final boolean deleteTargetPaths;
+
+  /**
+   * Constructor.
+   * @param outputPath destination path of the job.
+   * @param role role for log messages.
+   * @param context job/task context
+   * @param iostatistics IO Statistics
+   * @param stageEventCallbacks stage event callbacks.
+   */
+
+  ManifestCommitterConfig(
+      final Path outputPath,
+      final String role,
+      final JobContext context,
+      final IOStatisticsStore iostatistics,
+      final StageEventCallbacks stageEventCallbacks) {
+    this.role = role;
+    this.jobContext = context;
+    this.conf = context.getConfiguration();
+    this.destinationDir = outputPath;
+    this.iostatistics = iostatistics;
+    this.stageEventCallbacks = stageEventCallbacks;
+
+    Pair<String, String> pair = buildJobUUID(conf, context.getJobID());
+    this.jobUniqueId = pair.getLeft();
+    this.jobUniqueIdSource = pair.getRight();
+    this.jobAttemptNumber = getAppAttemptId(context);
+    this.jobAttemptId = this.jobUniqueId + "_" + jobAttemptNumber;
+
+    // build directories
+    this.dirs = new ManifestCommitterSupport.AttemptDirectories(outputPath,
+        this.jobUniqueId, jobAttemptNumber);
+
+    // read in configuration options
+    this.createJobMarker = conf.getBoolean(
+        SUCCESSFUL_JOB_OUTPUT_DIR_MARKER,
+        DEFAULT_CREATE_SUCCESSFUL_JOB_DIR_MARKER);
+    this.validateOutput = conf.getBoolean(
+        OPT_VALIDATE_OUTPUT,
+        OPT_VALIDATE_OUTPUT_DEFAULT);
+    this.deleteTargetPaths = conf.getBoolean(
+        OPT_DELETE_TARGET_FILES,
+        OPT_DELETE_TARGET_FILES_DEFAULT);
+
+    // if constructed with a task attempt, build the task ID and path.
+    if (context instanceof TaskAttemptContext) {
+      // it's a task
+      final TaskAttemptContext tac = (TaskAttemptContext) context;
+      TaskAttemptID taskAttempt = Objects.requireNonNull(
+          tac.getTaskAttemptID());
+      taskAttemptId = taskAttempt.toString();
+      taskId = taskAttempt.getTaskID().toString();
+      // Task attempt dir; must be different across instances
+      taskAttemptDir = dirs.getTaskAttemptPath(taskAttemptId);
+      // the context is also the progress callback.
+      progressable = tac;
+      name = String.format(InternalConstants.NAME_FORMAT_TASK_ATTEMPT, taskAttemptId);
+
+    } else {
+      // it's a job
+      taskId = "";
+      taskAttemptId = "";
+      taskAttemptDir = null;
+      progressable = null;
+      name = String.format(InternalConstants.NAME_FORMAT_JOB_ATTEMPT, jobAttemptId);
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "ManifestCommitterConfig{" +
+        "name=" + name +
+        ", destinationDir=" + destinationDir +
+        ", role='" + role + '\'' +
+        ", taskAttemptDir=" + taskAttemptDir +
+        ", createJobMarker=" + createJobMarker +
+        ", jobUniqueId='" + jobUniqueId + '\'' +
+        ", jobUniqueIdSource='" + jobUniqueIdSource + '\'' +
+        ", jobAttemptNumber=" + jobAttemptNumber +
+        ", jobAttemptId='" + jobAttemptId + '\'' +
+        ", taskId='" + taskId + '\'' +
+        ", taskAttemptId='" + taskAttemptId + '\'' +
+        '}';
+  }
+
+  /**
+   * Get the destination filesystem.
+   * @return destination FS.
+   * @throws IOException Problems binding to the destination FS.
+   */
+  FileSystem getDestinationFileSystem() throws IOException {
+    return FileSystem.get(destinationDir.toUri(), conf);
+  }
+
+  /**
+   * Create the stage config from the committer
+   * configuration.
+   * This does not bind the store operations
+   * or processors.
+   * @return a stage config with configuration options passed in.
+   */
+  StageConfig createStageConfig() {
+    StageConfig stageConfig = new StageConfig();
+    stageConfig
+        .withIOStatistics(iostatistics)
+        .withJobAttemptNumber(jobAttemptNumber)
+        .withJobDirectories(dirs)
+        .withJobId(jobUniqueId)
+        .withJobIdSource(jobUniqueIdSource)
+        .withName(name)
+        .withProgressable(progressable)
+        .withStageEventCallbacks(stageEventCallbacks)
+        .withTaskAttemptDir(taskAttemptDir)
+        .withTaskAttemptId(taskAttemptId)
+        .withTaskId(taskId)
+        .withDeleteTargetPaths(deleteTargetPaths);
+
+    return stageConfig;
+  }
+
+  public Path getDestinationDir() {
+    return destinationDir;
+  }
+
+  public String getRole() {
+    return role;
+  }
+
+  public Path getTaskAttemptDir() {
+    return taskAttemptDir;
+  }
+
+  public Path getJobAttemptDir() {
+    return dirs.getJobAttemptDir();
+  }
+
+  public Path getTaskManifestDir() {
+    return dirs.getTaskManifestDir();
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public JobContext getJobContext() {
+    return jobContext;
+  }
+
+  public boolean getCreateJobMarker() {
+    return createJobMarker;
+  }
+
+  public String getJobAttemptId() {
+    return jobAttemptId;
+  }
+
+  public String getTaskAttemptId() {
+    return taskAttemptId;
+  }
+
+  public String getTaskId() {
+    return taskId;
+  }
+
+  public String getJobUniqueId() {
+    return jobUniqueId;
+  }
+
+  public boolean getValidateOutput() {
+    return validateOutput;
+  }
+
+  public String getName() {
+    return name;
+  }
+
+  @Override
+  public IOStatisticsStore getIOStatistics() {
+    return iostatistics;
+  }
+
+  /**
+   * Create a new submitter task pool from the
+   * {@link ManifestCommitterConstants#OPT_IO_PROCESSORS}
+   * settings.
+   * @return a new thread pool.
+   */
+  public CloseableTaskPoolSubmitter createSubmitter() {
+    return createSubmitter(
+        OPT_IO_PROCESSORS, OPT_IO_PROCESSORS_DEFAULT);
+  }
+
+  /**
+   * Create a new submitter task pool.
+   * @param key config key with pool size.
+   * @param defVal default value.
+   * @return a new task pool.
+   */
+  public CloseableTaskPoolSubmitter createSubmitter(String key, int defVal) {
+    int numThreads = conf.getInt(key, defVal);
+    if (numThreads <= 0) {
+      // fall back to the default if the setting is invalid.
+      numThreads = defVal;
+    }
+    return createCloseableTaskSubmitter(numThreads, getJobAttemptId());
+  }
+
+  /**
+   * Create a new submitter task pool.
+   *
+   * @param numThreads thread count.
+   * @param jobAttemptId job ID
+   * @return a new task pool.
+   */
+  public static CloseableTaskPoolSubmitter createCloseableTaskSubmitter(
+      final int numThreads,
+      final String jobAttemptId) {
+    return new CloseableTaskPoolSubmitter(
+        HadoopExecutors.newFixedThreadPool(numThreads,
+            new ThreadFactoryBuilder()
+                .setDaemon(true)
+                .setNameFormat("manifest-committer-" + jobAttemptId + "-%d")
+                .build()));
+  }
+
+}
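
A minimal sketch of how a caller might size and build the submitter pool by hand, following the same logic as createSubmitter(); "conf" and "jobAttemptId" are assumed variables, and the helpers are those defined above:

    // Resolve the pool size from the io.threads option, falling back to the default.
    int threads = conf.getInt(
        ManifestCommitterConstants.OPT_IO_PROCESSORS,
        ManifestCommitterConstants.OPT_IO_PROCESSORS_DEFAULT);
    if (threads <= 0) {
      threads = ManifestCommitterConstants.OPT_IO_PROCESSORS_DEFAULT;
    }
    // Daemon threads named after the job attempt, wrapped in a closeable submitter.
    CloseableTaskPoolSubmitter pool =
        ManifestCommitterConfig.createCloseableTaskSubmitter(threads, jobAttemptId);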

+ 240 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConstants.java

@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperationsThroughFileSystem;
+
+/**
+ * Public constants for the manifest committer.
+ * This includes all configuration options and their default values.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+public final class ManifestCommitterConstants {
+
+  /**
+   * Suffix to use in manifest files in the manifest subdir.
+   * Value: {@value}.
+   */
+  public static final String MANIFEST_SUFFIX = "-manifest.json";
+
+  /**
+   * Prefix for summary files in the report dir. Value: {@value}.
+   */
+  public static final String SUMMARY_FILENAME_PREFIX = "summary-";
+
+  /**
+   * Format string used to build a summary file from a Job ID.
+   */
+  public static final String SUMMARY_FILENAME_FORMAT =
+      SUMMARY_FILENAME_PREFIX + "%s.json";
+
+  /**
+   * Suffix to use for temp files before renaming them.
+   * Value: {@value}.
+   */
+  public static final String TMP_SUFFIX = ".tmp";
+
+  /**
+   * Initial number of all app attempts.
+   * This is fixed in YARN; for Spark jobs the
+   * same number "0" is used.
+   */
+  public static final int INITIAL_APP_ATTEMPT_ID = 0;
+
+  /**
+   * Format string for building a job dir.
+   * Value: {@value}.
+   */
+  public static final String JOB_DIR_FORMAT_STR = "%s";
+
+  /**
+   * Format string for building a job attempt dir.
+   * This uses the job attempt number so previous versions
+   * can be found trivially.
+   * Value: {@value}.
+   */
+  public static final String JOB_ATTEMPT_DIR_FORMAT_STR = "%02d";
+
+  /**
+   * Name of directory under job attempt dir for manifests.
+   */
+  public static final String JOB_TASK_MANIFEST_SUBDIR = "manifests";
+
+  /**
+   * Name of directory under job attempt dir for task attempts.
+   */
+  public static final String JOB_TASK_ATTEMPT_SUBDIR = "tasks";
+
+
+  /**
+   * Committer classname as recorded in the committer _SUCCESS file.
+   */
+  public static final String MANIFEST_COMMITTER_CLASSNAME =
+      ManifestCommitter.class.getName();
+
+  /**
+   * Marker file to create on success: {@value}.
+   */
+  public static final String SUCCESS_MARKER = "_SUCCESS";
+
+  /** Default job marker option: {@value}. */
+  public static final boolean DEFAULT_CREATE_SUCCESSFUL_JOB_DIR_MARKER = true;
+
+  /**
+   * The limit to the number of committed objects tracked during
+   * job commits and saved to the _SUCCESS file.
+   * Value: {@value}.
+   */
+  public static final int SUCCESS_MARKER_FILE_LIMIT = 100;
+
+  /**
+   * The UUID for jobs: {@value}.
+   * This was historically created in Spark 1.x's SQL queries,
+   * but "went away".
+   * It has been restored in recent Spark releases.
+   * If found: it is used instead of the MR job attempt ID.
+   */
+  public static final String SPARK_WRITE_UUID = "spark.sql.sources.writeJobUUID";
+
+  /**
+   * String to use as source of the job ID.
+   * This SHOULD be kept in sync with that of
+   * {@code AbstractS3ACommitter.JobUUIDSource}.
+   * Value: {@value}.
+   */
+  public static final String JOB_ID_SOURCE_MAPREDUCE = "JobID";
+
+  /**
+   * Prefix to use for config options: {@value}.
+   */
+  public static final String OPT_PREFIX = "mapreduce.manifest.committer.";
+
+  /**
+   * Should dir cleanup delete task attempt dirs in parallel
+   * before trying to delete the top-level dirs?
+   * For GCS this may deliver speedup, while on ABFS it may avoid
+   * timeouts in certain deployments.
+   * Value: {@value}.
+   */
+  public static final String OPT_CLEANUP_PARALLEL_DELETE =
+      OPT_PREFIX + "cleanup.parallel.delete";
+
+  /**
+   * Default value:  {@value}.
+   */
+  public static final boolean OPT_CLEANUP_PARALLEL_DELETE_DIRS_DEFAULT = true;
+
+  /**
+   * Threads to use for IO.
+   */
+  public static final String OPT_IO_PROCESSORS = OPT_PREFIX + "io.threads";
+
+  /**
+   * Default value:  {@value}.
+   */
+  public static final int OPT_IO_PROCESSORS_DEFAULT = 64;
+
+  /**
+   * Directory for saving job summary reports.
+   * These are the _SUCCESS files, but are saved even on
+   * job failures.
+   * Value: {@value}.
+   */
+  public static final String OPT_SUMMARY_REPORT_DIR =
+      OPT_PREFIX + "summary.report.directory";
+
+  /**
+   * Directory under which to move manifests for diagnostics.
+   * Value: {@value}.
+   */
+  public static final String OPT_DIAGNOSTICS_MANIFEST_DIR =
+      OPT_PREFIX + "diagnostics.manifest.directory";
+
+  /**
+   * Should the output be validated?
+   * This will check expected vs actual file lengths, and,
+   * if etags can be obtained, etags.
+   * Value: {@value}.
+   */
+  public static final String OPT_VALIDATE_OUTPUT = OPT_PREFIX + "validate.output";
+
+  /**
+   * Default value: {@value}.
+   */
+  public static final boolean OPT_VALIDATE_OUTPUT_DEFAULT = false;
+
+  /**
+   * Should job commit probe for files/directories at the targets
+   * of renames and, if found, delete them?
+   *
+   * This is part of the effective behavior of the FileOutputCommitter,
+   * however it adds an extra delete call per file being committed.
+   *
+   * If a job is writing to a directory which has only just been created,
+   * or where unique filenames are being used, there is no need to perform
+   * this preparation.
+   * The recognition of newly created dirs is automatic.
+   *
+   * Value: {@value}.
+   */
+  public static final String OPT_DELETE_TARGET_FILES =
+      OPT_PREFIX + "delete.target.files";
+
+  /**
+   * Default value: {@value}.
+   */
+  public static final boolean OPT_DELETE_TARGET_FILES_DEFAULT = false;
+
+  /**
+   * Name of the factory: {@value}.
+   */
+  public static final String MANIFEST_COMMITTER_FACTORY =
+      ManifestCommitterFactory.class.getName();
+
+  /**
+   * Classname of the store operations; filesystems and tests
+   * may override.
+   * Value: {@value}.
+   */
+  public static final String OPT_STORE_OPERATIONS_CLASS = OPT_PREFIX + "store.operations.classname";
+
+  /**
+   * Default classname of the store operations.
+   * Value: {@value}.
+   */
+  public static final String STORE_OPERATIONS_CLASS_DEFAULT =
+      ManifestStoreOperationsThroughFileSystem.class.getName();
+
+  /**
+   * Stage attribute in audit context: {@value}.
+   */
+  public static final String CONTEXT_ATTR_STAGE = "st";
+
+  /**
+   * Task ID attribute in audit context: {@value}.
+   */
+  public static final String CONTEXT_ATTR_TASK_ATTEMPT_ID = "ta";
+
+  private ManifestCommitterConstants() {
+  }
+
+}
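
The options above are ordinary Configuration keys under the mapreduce.manifest.committer. prefix. A minimal sketch of tuning them from job setup code; the report directory path is only illustrative:

    Configuration conf = new Configuration();
    // check expected vs. actual lengths (and etags where available) during job commit
    conf.setBoolean("mapreduce.manifest.committer.validate.output", true);
    // threads to use for IO in the committer
    conf.setInt("mapreduce.manifest.committer.io.threads", 32);
    // keep a summary report even for failed jobs (path is illustrative)
    conf.set("mapreduce.manifest.committer.summary.report.directory",
        "hdfs://namenode:8020/tmp/committer-reports");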

+ 57 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterFactory.java

@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.UNSUPPORTED_FS_SCHEMAS;
+
+/**
+ * This is the committer factory to register as the source of committers
+ * for the job/filesystem schema.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public class ManifestCommitterFactory extends PathOutputCommitterFactory {
+
+  /**
+   * Name of this factory.
+   */
+  public static final String NAME = ManifestCommitterFactory.class.getName();
+
+  @Override
+  public ManifestCommitter createOutputCommitter(final Path outputPath,
+      final TaskAttemptContext context) throws IOException {
+    // safety check. S3A does not support this, so fail fast.
+    final String scheme = outputPath.toUri().getScheme();
+    if (UNSUPPORTED_FS_SCHEMAS.contains(scheme)) {
+      throw new PathIOException(outputPath.toString(),
+          "This committer does not work with the filesystem of type " + scheme);
+    }
+    return new ManifestCommitter(outputPath, context);
+  }
+
+}
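
For this factory to be selected for a given destination store, it can be bound to the filesystem scheme through the PathOutputCommitterFactory per-scheme key; a sketch for an abfs destination, assuming that key convention:

    // Bind the manifest committer factory to the abfs:// scheme.
    Configuration conf = new Configuration();
    conf.set("mapreduce.outputcommitter.factory.scheme.abfs",
        ManifestCommitterFactory.NAME);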

+ 264 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterStatisticNames.java

@@ -0,0 +1,264 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.statistics.StoreStatisticNames;
+
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_RENAME;
+
+/**
+ * Statistic names for committers.
+ * Please keep in sync with org.apache.hadoop.fs.s3a.Statistic
+ * so that S3A and manifest committers are in sync.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+public final class ManifestCommitterStatisticNames {
+
+
+  /** Amount of data committed: {@value}. */
+  public static final String COMMITTER_BYTES_COMMITTED_COUNT =
+      "committer_bytes_committed";
+
+  /** Duration tracking of the time to commit an entire job: {@value}. */
+  public static final String COMMITTER_COMMIT_JOB =
+      "committer_commit_job";
+
+  /** Number of files committed: {@value}. */
+  public static final String COMMITTER_FILES_COMMITTED_COUNT =
+      "committer_files_committed";
+
+  /** "Count of successful tasks:: {@value}. */
+  public static final String COMMITTER_TASKS_COMPLETED_COUNT =
+      "committer_tasks_completed";
+
+  /** Count of failed tasks: {@value}. */
+  public static final String COMMITTER_TASKS_FAILED_COUNT =
+      "committer_tasks_failed";
+
+  /** Count of commits aborted: {@value}. */
+  public static final String COMMITTER_COMMITS_ABORTED_COUNT =
+      "committer_commits_aborted";
+
+  /** Count of commits reverted: {@value}. */
+  public static final String COMMITTER_COMMITS_REVERTED_COUNT =
+      "committer_commits_reverted";
+
+  /** Count of commits failed: {@value}. */
+  public static final String COMMITTER_COMMITS_FAILED =
+      "committer_commits" + StoreStatisticNames.SUFFIX_FAILURES;
+
+  /**
+   * The number of files in a task. This will be a MeanStatistic.
+   */
+  public static final String COMMITTER_FILE_COUNT_MEAN =
+      "committer_task_file_count";
+
+  /**
+   * File Size.
+   */
+  public static final String COMMITTER_FILE_SIZE_MEAN =
+      "committer_task_file_size";
+
+  /**
+   * The number of directories in a task attempt. This will be a MeanStatistic.
+   */
+  public static final String COMMITTER_TASK_DIRECTORY_COUNT_MEAN =
+      "committer_task_directory_count";
+
+  /**
+   * The depth of a task attempt's directory tree. This will be a MeanStatistic.
+   */
+  public static final String COMMITTER_TASK_DIRECTORY_DEPTH_MEAN =
+      "committer_task_directory_depth";
+
+  /**
+   * The number of files in a task. This will be a MeanStatistic.
+   */
+  public static final String COMMITTER_TASK_FILE_COUNT_MEAN =
+      "committer_task_file_count";
+
+  /**
+   * The number of files in a task. This will be a MeanStatistic.
+   */
+  public static final String COMMITTER_TASK_FILE_SIZE_MEAN =
+      "committer_task_file_size";
+
+  /**
+   * The size of manifest files. This will be a MeanStatistic.
+   * Useful to highlight whether this size is too large and whether
+   * effort to optimise for file IO and memory consumption
+   * would be justified.
+   */
+  public static final String COMMITTER_TASK_MANIFEST_FILE_SIZE =
+      "committer_task_manifest_file_size";
+
+  /**
+   * A file renamed during the commit operation {@value}.
+   */
+  public static final String OP_COMMIT_FILE_RENAME =
+      "commit_file_rename";
+
+  /**
+   * A file rename during the commit operation was recovered from a failure {@value}.
+   */
+  public static final String OP_COMMIT_FILE_RENAME_RECOVERED =
+      "commit_file_rename_recovered";
+
+  /** Directory creation {@value}. */
+  public static final String OP_CREATE_DIRECTORIES = "op_create_directories";
+
+  /** Creating a single directory {@value}. */
+  public static final String OP_CREATE_ONE_DIRECTORY =
+      "op_create_one_directory";
+
+  /**
+   * A file in the destination directory tree has been deleted.
+   *  {@value}.
+   */
+  public static final String OP_DELETE_FILE_UNDER_DESTINATION =
+      "op_delete_file_under_destination";
+
+  /** Directory scan {@value}. */
+  public static final String OP_DIRECTORY_SCAN = "op_directory_scan";
+
+  /**
+   * Overall job commit {@value}.
+   */
+  public static final String OP_STAGE_JOB_COMMIT = COMMITTER_COMMIT_JOB;
+
+  /** {@value}. */
+  public static final String OP_LOAD_ALL_MANIFESTS = "op_load_all_manifests";
+
+  /**
+   * Load a task manifest: {@value}.
+   */
+  public static final String OP_LOAD_MANIFEST = "op_load_manifest";
+
+  /**
+   * mkdir failure statistic: {@value}.
+   * This is incremented whenever mkdir() returns false, e.g. because
+   * there was a file at the end of the path.
+   */
+  public static final String OP_MKDIRS_RETURNED_FALSE = "op_mkdir_returned_false";
+
+  /**
+   * msync statistic: {@value}.
+   * This should be the same as StoreStatisticNames.OP_MSYNC; it's
+   * duplicated just to isolate this committer into a single JAR
+   * for ease of testing.
+   */
+  public static final String OP_MSYNC = "op_msync";
+
+  /**
+   * Preparing ancestor dirs: {@value}.
+   * Probing for paths being files and, if so, deleting them.
+   */
+  public static final String OP_PREPARE_DIR_ANCESTORS = "op_prepare_dir_ancestors";
+
+  /** Rename a dir: {@value}. */
+  public static final String OP_RENAME_DIR = OP_RENAME;
+
+
+  /** Rename a file: {@value}. */
+  public static final String OP_RENAME_FILE = OP_RENAME;
+
+  /**
+   * Save a task manifest: {@value}.
+   */
+  public static final String OP_SAVE_TASK_MANIFEST =
+      "task_stage_save_task_manifest";
+
+  /**
+   * Task abort: {@value}.
+   */
+  public static final String OP_STAGE_TASK_ABORT_TASK
+      = "task_stage_abort_task";
+
+  /**
+   * Job abort: {@value}.
+   */
+  public static final String OP_STAGE_JOB_ABORT = "job_stage_abort";
+
+  /**
+   * Job cleanup: {@value}.
+   */
+  public static final String OP_STAGE_JOB_CLEANUP = "job_stage_cleanup";
+
+  /**
+   * Prepare Directories Stage: {@value}.
+   */
+  public static final String OP_STAGE_JOB_CREATE_TARGET_DIRS =
+      "job_stage_create_target_dirs";
+
+  /**
+   * Load Manifest Stage: {@value}.
+   */
+  public static final String OP_STAGE_JOB_LOAD_MANIFESTS =
+      "job_stage_load_manifests";
+
+  /**
+   * Rename files stage duration: {@value}.
+   */
+  public static final String OP_STAGE_JOB_RENAME_FILES =
+      "job_stage_rename_files";
+
+
+  /**
+   * Job Setup Stage: {@value}.
+   */
+  public static final String OP_STAGE_JOB_SETUP = "job_stage_setup";
+
+  /**
+   * Job saving _SUCCESS marker Stage: {@value}.
+   */
+  public static final String OP_STAGE_JOB_SAVE_SUCCESS =
+      "job_stage_save_success_marker";
+
+  /**
+   * Output Validation (within job commit) Stage: {@value}.
+   */
+  public static final String OP_STAGE_JOB_VALIDATE_OUTPUT =
+      "job_stage_optional_validate_output";
+
+  /**
+   * Task saving manifest file Stage: {@value}.
+   */
+  public static final String OP_STAGE_TASK_SAVE_MANIFEST =
+      "task_stage_save_manifest";
+
+  /**
+   * Task Setup Stage: {@value}.
+   */
+  public static final String OP_STAGE_TASK_SETUP = "task_stage_setup";
+
+  /**
+   * Task Commit Stage: {@value}.
+   */
+  public static final String OP_STAGE_TASK_COMMIT = "task_stage_commit";
+
+  /** Task Scan directory Stage: {@value}. */
+  public static final String OP_STAGE_TASK_SCAN_DIRECTORY
+      = "task_stage_scan_directory";
+
+  private ManifestCommitterStatisticNames() {
+  }
+}
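
These names key the counters and durations the committer publishes through IOStatistics and the _SUCCESS file. A small sketch of reading one back, assuming "success" is a loaded ManifestSuccessData instance:

    // Count of file renames during job commit; null if the statistic was not recorded.
    Long renames = success.getIOStatistics()
        .counters()
        .get(ManifestCommitterStatisticNames.OP_RENAME_FILE);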

+ 137 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/AbstractManifestData.java

@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import javax.annotation.Nullable;
+import java.io.IOException;
+import java.io.Serializable;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.util.JsonSerialization;
+
+import static java.util.Objects.requireNonNull;
+
+/**
+ * Class for single/multiple commit data structures.
+ */
+@SuppressWarnings("serial")
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public abstract class AbstractManifestData<T extends AbstractManifestData>
+    implements Serializable, IOStatisticsSource {
+
+
+  /**
+   * Convert a path to a string which can be included in the JSON.
+   * @param path path
+   * @return a string value, or, if path==null, null.
+   */
+  public static String marshallPath(@Nullable Path path) {
+    return path != null
+        ? path.toUri().toString()
+        : null;
+  }
+
+  /**
+   * Convert a string path to Path type, by way of a URI.
+   * @param path path as a string
+   * @return path value
+   * @throws RuntimeException marshalling failure.
+   */
+  public static Path unmarshallPath(String path) {
+    try {
+      return new Path(new URI(requireNonNull(path, "No path")));
+    } catch (URISyntaxException e) {
+      throw new RuntimeException(
+          "Failed to parse \"" + path + "\" : " + e,
+          e);
+
+    }
+  }
+
+  /**
+   * Validate the data: those fields which must be non empty, must be set.
+   * @return the validated instance.
+   * @throws IOException if the data is invalid
+   */
+  public abstract T validate() throws IOException;
+
+  /**
+   * Serialize to JSON and then to a byte array, after performing a
+   * preflight validation of the data to be saved.
+   * @return the data in a persistable form.
+   * @throws IOException serialization problem or validation failure.
+   */
+  public abstract byte[] toBytes() throws IOException;
+
+  /**
+   * Save to a hadoop filesystem.
+   * @param fs filesystem
+   * @param path path
+   * @param overwrite should any existing file be overwritten
+   * @throws IOException IO exception
+   */
+  public abstract void save(FileSystem fs, Path path, boolean overwrite)
+      throws IOException;
+
+  /**
+   * Get a (usually shared) JSON serializer.
+   * @return a serializer.
+   */
+  public abstract JsonSerialization<T> createSerializer();
+
+  /**
+   * Verify that all instances in a collection are of the given class.
+   * @param it iterator
+   * @param classname classname to require
+   * @throws IOException on a failure
+   */
+  void validateCollectionClass(Iterable it, Class classname)
+      throws IOException {
+    for (Object o : it) {
+      verify(o.getClass().equals(classname),
+          "Collection element is not a %s: %s", classname, o.getClass());
+    }
+  }
+
+  /**
+   * Verify that a condition holds.
+   * @param expression expression which must be true
+   * @param message message to raise on a failure
+   * @param args arguments for the message formatting
+   * @throws IOException on a failure
+   */
+
+  static void verify(boolean expression,
+      String message,
+      Object... args) throws IOException {
+    if (!expression) {
+      throw new IOException(String.format(message, args));
+    }
+  }
+}
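
Paths are persisted in the JSON as URI strings through marshallPath/unmarshallPath; a round-trip sketch with an illustrative path:

    Path dest = new Path("abfs://container@account.dfs.core.windows.net/out/part-00000");
    String wire = AbstractManifestData.marshallPath(dest);      // URI string stored in JSON
    Path restored = AbstractManifestData.unmarshallPath(wire);  // back to a Path
    // restored is expected to equal dest for a well-formed URI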

+ 44 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/DiagnosticKeys.java

@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+/**
+ * Diagnostic keys in the manifests.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+public final class DiagnosticKeys {
+  /**
+   * Attribute added to diagnostics in _SUCCESS file.
+   */
+  public static final String PRINCIPAL = "principal";
+  public static final String STAGE = "stage";
+  public static final String EXCEPTION = "exception";
+  public static final String STACKTRACE = "stacktrace";
+
+
+  /** Directory where manifests were renamed: {@value}. */
+  public static final String MANIFESTS = "manifests";
+
+  private DiagnosticKeys() {
+  }
+}

+ 202 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/DirEntry.java

@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Objects;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.Path;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.marshallPath;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.unmarshallPath;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.verify;
+
+/**
+ * A directory entry in the task manifest.
+ * Uses shorter field names for smaller files.
+ * Hash and equals are on dir name only; there's no real expectation
+ * that those operations are needed.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+public final class DirEntry implements Serializable {
+
+  private static final long serialVersionUID = 5658520530209859765L;
+
+  /**
+   * Destination directory.
+   */
+  @JsonProperty("d")
+  private String dir;
+
+  /**
+   * Type of dest entry as found when probed for in task commit.
+   */
+  @JsonProperty("t")
+  private int type;
+
+  /**
+   * Level in the treewalk.
+   */
+  @JsonProperty("l")
+  private int level;
+
+  /**
+   * Constructor only for use by jackson.
+   * Do Not Delete.
+   */
+  private DirEntry() {
+  }
+
+  /**
+   * Construct an entry.
+   *
+   * @param dir destination path.
+   * @param type type of dest entry
+   * @param level Level in the treewalk.
+   *
+   */
+  public DirEntry(
+      final String dir,
+      final int type,
+      final int level) {
+    this.dir = requireNonNull(dir);
+    this.type = type;
+    this.level = level;
+  }
+
+  /**
+   * Construct an entry.
+   *
+   * @param dir destination path.
+   * @param type type of dest entry
+   * @param level Level in the treewalk.
+   *
+   */
+  public DirEntry(
+      final Path dir,
+      final int type,
+      final int level) {
+    this(marshallPath(dir), type, level);
+  }
+
+  public void setDir(final String dir) {
+    this.dir = dir;
+  }
+
+  public String getDir() {
+    return dir;
+  }
+
+  @JsonIgnore
+  public Path getDestPath() {
+    return unmarshallPath(dir);
+  }
+
+  public int getType() {
+    return type;
+  }
+
+  public void setType(final int type) {
+    this.type = type;
+  }
+
+  public void setLevel(final int level) {
+    this.level = level;
+  }
+
+  public int getLevel() {
+    return level;
+  }
+
+  @JsonIgnore
+  public EntryStatus getStatus() {
+    return EntryStatus.toEntryStatus(type);
+  }
+
+  @JsonIgnore
+  public void setStatus(EntryStatus status) {
+    setType(status.ordinal());
+  }
+  public void validate() throws IOException {
+    final String s = toString();
+    verify(dir != null && dir.length() > 0,
+        "destination path is missing from " + s);
+    verify(type >= 0,
+        "Invalid type in " + s);
+    verify(level >= 0,
+        "Invalid level in " + s);
+  }
+
+  @Override
+  public String toString() {
+    return "DirEntry{" +
+        "dir='" + dir + '\'' +
+        ", type=" + type +
+        ", level=" + level +
+        '}';
+  }
+
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    DirEntry dirEntry = (DirEntry) o;
+    return dir.equals(dirEntry.dir);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(dir);
+  }
+
+  /**
+   * A directory entry.
+   * @param dest destination path.
+   * @param type type
+   * @param level Level in the treewalk.
+   * @return an entry
+   */
+  public static DirEntry dirEntry(Path dest, int type, int level) {
+    return new DirEntry(dest, type, level);
+  }
+
+  /**
+   * A directory entry.
+   * @param dest destination path.
+   * @param type type
+   * @param level Level in the treewalk.
+   * @return an entry
+   */
+  public static DirEntry dirEntry(Path dest, EntryStatus type, int level) {
+    return dirEntry(dest, type.ordinal(), level);
+  }
+
+}
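
A sketch of recording a destination directory probed during task commit; the path, status and level are illustrative:

    DirEntry entry = DirEntry.dirEntry(
        new Path("abfs://container@account.dfs.core.windows.net/out/year=2021"),
        EntryStatus.not_found,   // nothing found at the destination when probed
        1);                      // level in the treewalk
    entry.validate();            // throws IOException if a field is invalid
    EntryStatus status = entry.getStatus();   // type int mapped back to the enum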

+ 87 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/EntryStatus.java

@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import javax.annotation.Nullable;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileStatus;
+
+/**
+ * Status of a file or dir entry, designed to be marshalled as
+ * an integer: the ordinal value of the enum is the
+ * wire value.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public enum EntryStatus {
+
+  unknown,
+  not_found,
+  file,
+  dir,
+  created_dir;
+
+  /**
+   * Go from a marshalled type to a status value.
+   * Any out of range value is converted to unknown.
+   * @param type type
+   * @return the status value.
+   */
+  public static EntryStatus toEntryStatus(int type) {
+    switch (type) {
+    case 1:
+      return not_found;
+    case 2:
+      return file;
+    case 3:
+      return dir;
+    case 4:
+      return created_dir;
+    case 0:
+    default:
+      return unknown;
+    }
+  }
+
+
+  /**
+   * Go from the result of a getFileStatus call or
+   * listing entry to a status.
+   * A null argument is mapped to {@link #not_found}.
+   * @param st file status
+   * @return the status enum.
+   */
+  public static EntryStatus toEntryStatus(@Nullable FileStatus st) {
+
+    if (st == null) {
+      return not_found;
+    }
+    if (st.isDirectory()) {
+      return dir;
+    }
+    if (st.isFile()) {
+      return file;
+    }
+    return unknown;
+  }
+
+
+}
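
The enum travels as its ordinal, so the wire value and the probe result map as in this sketch (FileStatus import assumed):

    int wire = EntryStatus.file.ordinal();                   // 2 on the wire
    EntryStatus decoded = EntryStatus.toEntryStatus(wire);   // back to EntryStatus.file
    EntryStatus absent = EntryStatus.toEntryStatus((FileStatus) null);  // not_found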

+ 189 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/FileEntry.java

@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Objects;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.Path;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.marshallPath;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.unmarshallPath;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.verify;
+
+/**
+ * A File entry in the task manifest.
+ * Uses shorter field names for smaller files.
+ */
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public final class FileEntry implements Serializable {
+
+  private static final long serialVersionUID = -550288489009777867L;
+
+  @JsonProperty("s")
+  private String source;
+
+  @JsonProperty("d")
+  private String dest;
+
+  @JsonProperty("z")
+  private long size;
+
+  /**
+   * Etag value if we can extract this.
+   */
+  @JsonProperty("e")
+  private String etag;
+
+  /**
+   * Constructor only for use by jackson.
+   * Do Not Delete.
+   */
+  private FileEntry() {
+  }
+
+  /**
+   * Construct an entry.
+   * @param source source path.
+   * @param dest destination path.
+   * @param size file size.
+   * @param etag optional etag
+   */
+  public FileEntry(
+      final String source,
+      final String dest,
+      final long size,
+      final String etag) {
+    this.source = source;
+    this.dest = dest;
+    this.size = size;
+    this.etag = etag;
+  }
+
+
+  /**
+   * Construct an entry.
+   * @param source source path.
+   * @param dest destination path.
+   * @param size file size.
+   * @param etag optional etag
+   */
+  public FileEntry(
+      final Path source,
+      final Path dest,
+      final long size,
+      final String etag) {
+    this(marshallPath(source), marshallPath(dest), size, etag);
+  }
+
+
+  public void setSource(final String source) {
+    this.source = source;
+  }
+
+  public String getSource() {
+    return source;
+  }
+
+  @JsonIgnore
+  public Path getSourcePath() {
+    return unmarshallPath(source);
+  }
+
+  public void setDest(final String dest) {
+    this.dest = dest;
+  }
+
+  public String getDest() {
+    return dest;
+  }
+
+  @JsonIgnore
+  public Path getDestPath() {
+    return unmarshallPath(dest);
+  }
+
+  public long getSize() {
+    return size;
+  }
+
+  public void setSize(final long size) {
+    this.size = size;
+  }
+
+  public String getEtag() {
+    return etag;
+  }
+
+  public void setEtag(final String etag) {
+    this.etag = etag;
+  }
+
+  public void validate() throws IOException {
+    final String s = toString();
+    verify(source != null && source.length() > 0,
+        "Source is missing from " + s);
+    verify(dest != null && dest.length() > 0,
+        "Source is missing from " + s);
+    verify(size >= 0,
+        "Invalid size in " + s);
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder(
+        "FileOrDirEntry{");
+    sb.append("source='").append(source).append('\'');
+    sb.append(", dest='").append(dest).append('\'');
+    sb.append(", size=").append(size);
+    sb.append(", etag='").append(etag).append('\'');
+    sb.append('}');
+    return sb.toString();
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    FileEntry that = (FileEntry) o;
+    return size == that.size && source.equals(that.source) && dest.equals(
+        that.dest) &&
+        Objects.equals(etag, that.etag);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(source, dest);
+  }
+
+}
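
A sketch of the entry a task attempt would add for one file to be renamed at job commit; the paths, size and null etag are illustrative:

    FileEntry file = new FileEntry(
        new Path("abfs://container@account.dfs.core.windows.net/tmp/attempt_00/part-00000"),
        new Path("abfs://container@account.dfs.core.windows.net/out/part-00000"),
        4096L,    // expected length, checked if output validation is enabled
        null);    // etag, when the store exposes one
    file.validate();   // verifies source, dest and size are set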

+ 144 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestPrinter.java

@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ExitUtil;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+
+/**
+ * Tool to print a manifest.
+ */
+public class ManifestPrinter extends Configured implements Tool {
+
+  private static final String USAGE = "ManifestPrinter <success-file>";
+
+  /**
+   * Output for printing.
+   */
+  private final PrintStream out;
+
+  /**
+   * Print to System.out.
+   */
+  public ManifestPrinter() {
+    this(null, System.out);
+  }
+
+  /**
+   * Print to the supplied stream.
+   * @param conf configuration
+   * @param out output
+   */
+  public ManifestPrinter(Configuration conf, PrintStream out) {
+    super(conf);
+    this.out = out;
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args.length != 1) {
+      printUsage();
+      return -1;
+    }
+    Path path = new Path(args[0]);
+    loadAndPrintManifest(path.getFileSystem(getConf()), path);
+    return 0;
+  }
+
+  /**
+   * Load and print a manifest.
+   * @param fs filesystem.
+   * @param path path
+   * @throws IOException failure to load
+   * @return the manifest
+   */
+  public ManifestSuccessData loadAndPrintManifest(FileSystem fs, Path path)
+      throws IOException {
+    // load the manifest
+    println("Manifest file: %s", path);
+    final ManifestSuccessData success = ManifestSuccessData.load(fs, path);
+
+    printManifest(success);
+    return success;
+  }
+
+  private void printManifest(ManifestSuccessData success) {
+    field("succeeded", success.getSuccess());
+    field("created", success.getDate());
+    field("committer", success.getCommitter());
+    field("hostname", success.getHostname());
+    field("description", success.getDescription());
+    field("jobId", success.getJobId());
+    field("jobIdSource", success.getJobIdSource());
+    field("stage", success.getStage());
+    println("Diagnostics\n%s",
+        success.dumpDiagnostics("  ", " = ", "\n"));
+    println("Statistics:\n%s",
+        ioStatisticsToPrettyString(success.getIOStatistics()));
+    out.flush();
+  }
+
+  private void printUsage() {
+    println(USAGE);
+  }
+
+  /**
+   * Print a line to the output stream.
+   * @param format format string
+   * @param args arguments.
+   */
+  private void println(String format, Object... args) {
+    out.format(format, args);
+    out.println();
+  }
+
+  /**
+   * Print a field, if non-null.
+   * @param name field name.
+   * @param value value.
+   */
+  private void field(String name, Object value) {
+    if (value != null) {
+      println("%s: %s", name, value);
+    }
+  }
+
+  /**
+   * Entry point.
+   */
+  public static void main(String[] argv) throws Exception {
+
+    try {
+      int res = ToolRunner.run(new ManifestPrinter(), argv);
+      System.exit(res);
+    } catch (ExitUtil.ExitException e) {
+      ExitUtil.terminate(e);
+    }
+  }
+}
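
Besides the ToolRunner entry point, the printer can be driven programmatically; a sketch with an illustrative _SUCCESS path:

    Configuration conf = new Configuration();
    ManifestPrinter printer = new ManifestPrinter(conf, System.out);
    Path successFile = new Path("abfs://container@account.dfs.core.windows.net/out/_SUCCESS");
    ManifestSuccessData summary = printer.loadAndPrintManifest(
        successFile.getFileSystem(conf), successFile);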

+ 493 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestSuccessData.java

@@ -0,0 +1,493 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.exception.ExceptionUtils;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatistics;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.fs.statistics.IOStatisticsSupport;
+import org.apache.hadoop.util.JsonSerialization;
+
+/**
+ * Summary data saved into a {@code _SUCCESS} marker file.
+ *
+ * This is a copy of the S3A committer success data format, with
+ * a goal of being/remaining compatible.
+ * This makes it easier for tests in downstream modules to
+ * be able to parse the success files from any of the committers.
+ *
+ * This should be considered public; it is based on the S3A
+ * format, which has proven stable over time.
+ *
+ * The JSON format SHOULD be considered public and evolving
+ * with compatibility across versions.
+ *
+ * All the Java serialization data is different and may change
+ * across versions with no stability guarantees other than
+ * "manifest summaries MAY be serialized between processes with
+ * the exact same version of this binary on their classpaths."
+ * That is sufficient for testing in Spark.
+ *
+ * To aid with Java serialization, the maps and lists are
+ * exclusively those which serialize well.
+   * IOStatisticsSnapshot has a lot of complexity in its marshalling;
+   * this class doesn't worry about concurrent access,
+   * so is simpler.
+ *
+ */
+@SuppressWarnings({"unused", "CollectionDeclaredAsConcreteClass"})
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class ManifestSuccessData
+    extends AbstractManifestData<ManifestSuccessData> {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ManifestSuccessData.class);
+
+  /**
+   * Supported version value: {@value}.
+   * If this is changed the value of {@link #serialVersionUID} will change,
+   * to avoid deserialization problems.
+   */
+  public static final int VERSION = 1;
+
+  /**
+   * Serialization ID: {@value}.
+   */
+  private static final long serialVersionUID = 4755993198698104084L + VERSION;
+
+  /**
+   * Name to include in persisted data, so as to differentiate from
+   * any other manifests: {@value}.
+   */
+  public static final String NAME
+      = "org.apache.hadoop.fs.s3a.commit.files.SuccessData/" + VERSION;
+
+  /**
+   * Name of file; includes version marker.
+   */
+  private String name;
+
+  /** Timestamp of creation. */
+  private long timestamp;
+
+  /**
+   * Did this succeed?
+   * It is implicitly true in a _SUCCESS file, but if the file
+   * is also saved to a log dir, then it depends on the outcome of the job.
+   */
+  private boolean success = true;
+
+  /** Timestamp as date string; no expectation of parseability. */
+  private String date;
+
+  /**
+   * Host which created the file (implicitly: committed the work).
+   */
+  private String hostname;
+
+  /**
+   * Committer name.
+   */
+  private String committer;
+
+  /**
+   * Description text.
+   */
+  private String description;
+
+  /** Job ID, if known. */
+  private String jobId = "";
+
+  /**
+   * Source of the job ID.
+   */
+  private String jobIdSource = "";
+
+  /**
+   * Metrics.
+   * Uses a treemap for serialization.
+   */
+  private TreeMap<String, Long> metrics = new TreeMap<>();
+
+  /**
+   * Diagnostics information.
+   * Uses a treemap for serialization.
+   */
+  private TreeMap<String, String> diagnostics = new TreeMap<>();
+
+  /**
+   * Filenames in the commit.
+   */
+  private ArrayList<String> filenames = new ArrayList<>(0);
+
+  /**
+   * IOStatistics.
+   */
+  @JsonProperty("iostatistics")
+  private IOStatisticsSnapshot iostatistics = new IOStatisticsSnapshot();
+
+  /**
+   * State (committed, aborted).
+   */
+  private String state;
+
+  /**
+   * Stage: last stage executed.
+   */
+  private String stage;
+
+  @Override
+  public ManifestSuccessData validate() throws IOException {
+    verify(name != null,
+        "Incompatible file format: no 'name' field");
+    verify(NAME.equals(name),
+        "Incompatible file format: " + name);
+    return this;
+  }
+
+  @Override
+  public JsonSerialization<ManifestSuccessData> createSerializer() {
+    return serializer();
+  }
+
+  @Override
+  public byte[] toBytes() throws IOException {
+    return serializer().toBytes(this);
+  }
+
+  /**
+   * To JSON.
+   * @return json string value.
+   * @throws IOException failure
+   */
+  public String toJson() throws IOException {
+    return serializer().toJson(this);
+  }
+
+  @Override
+  public void save(FileSystem fs, Path path, boolean overwrite)
+      throws IOException {
+    // always set the name field before being saved.
+    name = NAME;
+    serializer().save(fs, path, this, overwrite);
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder(
+        "ManifestSuccessData{");
+    sb.append("committer='").append(committer).append('\'');
+    sb.append(", hostname='").append(hostname).append('\'');
+    sb.append(", description='").append(description).append('\'');
+    sb.append(", date='").append(date).append('\'');
+    sb.append(", filenames=[").append(
+        StringUtils.join(filenames, ", "))
+        .append("]");
+    sb.append('}');
+    return sb.toString();
+  }
+
+  /**
+   * Dump the metrics (if any) to a string.
+   * The metrics are sorted for ease of viewing.
+   * @param prefix prefix before every entry
+   * @param middle string between key and value
+   * @param suffix suffix to each entry
+   * @return the dumped string
+   */
+  public String dumpMetrics(String prefix, String middle, String suffix) {
+    return joinMap(metrics, prefix, middle, suffix);
+  }
+
+  /**
+   * Dump the diagnostics (if any) to a string.
+   * @param prefix prefix before every entry
+   * @param middle string between key and value
+   * @param suffix suffix to each entry
+   * @return the dumped string
+   */
+  public String dumpDiagnostics(String prefix, String middle, String suffix) {
+    return joinMap(diagnostics, prefix, middle, suffix);
+  }
+
+  /**
+   * Join any map of string to value into a string, sorting the keys first.
+   * @param map map to join
+   * @param prefix prefix before every entry
+   * @param middle string between key and value
+   * @param suffix suffix to each entry
+   * @return a string for reporting.
+   */
+  protected static String joinMap(Map<String, ?> map,
+      String prefix,
+      String middle, String suffix) {
+    if (map == null) {
+      return "";
+    }
+    List<String> list = new ArrayList<>(map.keySet());
+    Collections.sort(list);
+    StringBuilder sb = new StringBuilder(list.size() * 32);
+    for (String k : list) {
+      sb.append(prefix)
+          .append(k)
+          .append(middle)
+          .append(map.get(k))
+          .append(suffix);
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Load an instance from a file, then validate it.
+   * @param fs filesystem
+   * @param path path
+   * @return the loaded instance
+   * @throws IOException IO failure
+   */
+  public static ManifestSuccessData load(FileSystem fs, Path path)
+      throws IOException {
+    LOG.debug("Reading success data from {}", path);
+    ManifestSuccessData instance = serializer().load(fs, path);
+    instance.validate();
+    return instance;
+  }
+
+  /**
+   * Get a JSON serializer for this class.
+   * @return a serializer.
+   */
+  public static JsonSerialization<ManifestSuccessData> serializer() {
+    return new JsonSerialization<>(ManifestSuccessData.class, false, true);
+  }
+
+  public String getName() {
+    return name;
+  }
+
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  /** @return timestamp of creation. */
+  public long getTimestamp() {
+    return timestamp;
+  }
+
+  public void setTimestamp(long timestamp) {
+    this.timestamp = timestamp;
+  }
+
+  /** @return timestamp as date; no expectation of parseability. */
+  public String getDate() {
+    return date;
+  }
+
+  public void setDate(String date) {
+    this.date = date;
+  }
+
+  /**
+   * @return host which created the file (implicitly: committed the work).
+   */
+  public String getHostname() {
+    return hostname;
+  }
+
+  public void setHostname(String hostname) {
+    this.hostname = hostname;
+  }
+
+  /**
+   * @return committer name.
+   */
+  public String getCommitter() {
+    return committer;
+  }
+
+  public void setCommitter(String committer) {
+    this.committer = committer;
+  }
+
+  /**
+   * @return any description text.
+   */
+  public String getDescription() {
+    return description;
+  }
+
+  public void setDescription(String description) {
+    this.description = description;
+  }
+
+  /**
+   * @return any metrics.
+   */
+  public Map<String, Long> getMetrics() {
+    return metrics;
+  }
+
+  public void setMetrics(TreeMap<String, Long> metrics) {
+    this.metrics = metrics;
+  }
+
+  /**
+   * @return a list of filenames in the commit.
+   */
+  public List<String> getFilenames() {
+    return filenames;
+  }
+
+  /**
+   * Get the list of filenames as paths.
+   * @return the paths.
+   */
+  @JsonIgnore
+  public List<Path> getFilenamePaths() {
+    return getFilenames().stream()
+        .map(AbstractManifestData::unmarshallPath)
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * Set the list of filename paths.
+   */
+  @JsonIgnore
+  public void setFilenamePaths(List<Path> paths) {
+    setFilenames(new ArrayList<>(
+        paths.stream()
+            .map(AbstractManifestData::marshallPath)
+            .collect(Collectors.toList())));
+  }
+
+  public void setFilenames(ArrayList<String> filenames) {
+    this.filenames = filenames;
+  }
+
+  public Map<String, String> getDiagnostics() {
+    return diagnostics;
+  }
+
+  public void setDiagnostics(TreeMap<String, String> diagnostics) {
+    this.diagnostics = diagnostics;
+  }
+
+  /**
+   * Add a diagnostics entry.
+   * @param key name
+   * @param value value
+   */
+  public void putDiagnostic(String key, String value) {
+    diagnostics.put(key, value);
+  }
+
+  /** @return Job ID, if known. */
+  public String getJobId() {
+    return jobId;
+  }
+
+  public void setJobId(String jobId) {
+    this.jobId = jobId;
+  }
+
+  public String getJobIdSource() {
+    return jobIdSource;
+  }
+
+  public void setJobIdSource(final String jobIdSource) {
+    this.jobIdSource = jobIdSource;
+  }
+
+  @Override
+  public IOStatisticsSnapshot getIOStatistics() {
+    return iostatistics;
+  }
+
+  public void setIOStatistics(final IOStatisticsSnapshot ioStatistics) {
+    this.iostatistics = ioStatistics;
+  }
+
+  /**
+   * Set the IOStatistics to a snapshot of the source.
+   * @param iostats statistics; may be null.
+   */
+  public void snapshotIOStatistics(IOStatistics iostats) {
+    setIOStatistics(IOStatisticsSupport.snapshotIOStatistics(iostats));
+  }
+
+  /**
+   * Set the success flag.
+   * @param success did the job succeed?
+   */
+  public void setSuccess(boolean success) {
+    this.success = success;
+  }
+
+  /**
+   * Get the success flag.
+   * @return did the job succeed?
+   */
+  public boolean getSuccess() {
+    return success;
+  }
+
+  public String getState() {
+    return state;
+  }
+
+  public void setState(String state) {
+    this.state = state;
+  }
+
+  public String getStage() {
+    return stage;
+  }
+
+  /**
+   * Note a failure by setting success flag to false,
+   * then add the exception to the diagnostics.
+   * @param thrown throwable
+   */
+  public void recordJobFailure(Throwable thrown) {
+    setSuccess(false);
+    String stacktrace = ExceptionUtils.getStackTrace(thrown);
+    diagnostics.put(DiagnosticKeys.EXCEPTION, thrown.toString());
+    diagnostics.put(DiagnosticKeys.STACKTRACE, stacktrace);
+  }
+}
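
Downstream tests and tools can load the summary directly and inspect the outcome; a sketch assuming an existing Configuration "conf" and an illustrative marker path:

    Path marker = new Path("/results/_SUCCESS");
    ManifestSuccessData success = ManifestSuccessData.load(
        marker.getFileSystem(conf), marker);
    if (!success.getSuccess()) {
      System.out.println("failed at stage " + success.getStage());
      System.out.println(success.getDiagnostics().get(DiagnosticKeys.EXCEPTION));
    }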

+ 365 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/TaskManifest.java

@@ -0,0 +1,365 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import javax.annotation.Nullable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.util.JsonSerialization;
+
+/**
+ * This is the manifest of files which were created by
+ * this task attempt.
+ *
+ * Versioning:
+ * In a rolling cluster update, MR or Spark workers deployed on a newer
+ * node (and/or with a newer version of artifacts in a cluster-FS hosted
+ * tar.gz file) may be running a later version of this class than the
+ * job committer.
+ * If any changes are made to the manifest which are backwards compatible,
+ * this new manifest can still be loaded from JSON and processed.
+ *
+ * If the manifest is no longer compatible, the job output may
+ * be invalid.
+ *
+ * It is CRITICAL that the {@link #VERSION} constant is updated whenever
+ * such an incompatible change is made.
+ */
+@SuppressWarnings("unused")
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class TaskManifest extends AbstractManifestData<TaskManifest> {
+
+  /**
+   * Supported version value: {@value}.
+   * If this is changed the value of {@code serialVersionUID} will change,
+   * to avoid deserialization problems.
+   */
+  public static final int VERSION = 1;
+
+  /**
+   * Manifest type.
+   */
+  public static final String TYPE =
+      "org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest/"
+      + VERSION;
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(TaskManifest.class);
+
+  /**
+   * Serialization version.
+   */
+  private static final long serialVersionUID = 7090285511966046094L + VERSION;
+
+  /**
+   * Manifest type.
+   */
+  @JsonProperty("type")
+  private String type = TYPE;
+
+  /** Version marker. */
+  @JsonProperty("version")
+  private int version = VERSION;
+
+  /**
+   * Job ID; constant over multiple attempts.
+   */
+  @JsonProperty("jobId")
+  private String jobId;
+
+  /**
+   * Number of the job attempt; starts at zero.
+   */
+  @JsonProperty("jobAttemptNumber")
+  private int jobAttemptNumber;
+
+  /**
+   * Task ID.
+   */
+  @JsonProperty("taskID")
+  private String taskID;
+
+  /**
+   * Task Attempt ID.
+   */
+  @JsonProperty("taskAttemptID")
+  private String taskAttemptID;
+
+  /**
+   * The task attempt directory.
+   */
+  @JsonProperty("taskAttemptDir")
+  private String taskAttemptDir;
+
+  /**
+   * The list of files to commit from this task attempt, including
+   * precalculated destination and size.
+   */
+  @JsonProperty("files")
+  private final List<FileEntry> filesToCommit = new ArrayList<>();
+
+  /**
+   * The list of directories needed by this task attempt, both
+   * source and destination.
+   * All these directories must exist in the destination before any of
+   * the files can be renamed there.
+   */
+  @JsonProperty("directories")
+  private final List<DirEntry> destDirectories = new ArrayList<>();
+
+  /**
+   * Any custom extra data committers may choose to add.
+   */
+  private final Map<String, String> extraData = new HashMap<>(0);
+
+  /**
+   * IOStatistics.
+   */
+  @JsonProperty("iostatistics")
+  private IOStatisticsSnapshot iostatistics = new IOStatisticsSnapshot();
+
+  /**
+   * Empty constructor; will be used by Jackson as well as in application
+   * code.
+   */
+  public TaskManifest() {
+  }
+
+  public String getType() {
+    return type;
+  }
+
+  public void setType(String type) {
+    this.type = type;
+  }
+
+  public int getVersion() {
+    return version;
+  }
+
+  public void setVersion(int version) {
+    this.version = version;
+  }
+
+  @Override
+  public IOStatisticsSnapshot getIOStatistics() {
+    return iostatistics;
+  }
+
+  public void setIOStatistics(
+      @Nullable final IOStatisticsSnapshot ioStatistics) {
+    this.iostatistics = ioStatistics;
+  }
+
+  public String getJobId() {
+    return jobId;
+  }
+
+  public void setJobId(final String jobId) {
+    this.jobId = jobId;
+  }
+
+  public int getJobAttemptNumber() {
+    return jobAttemptNumber;
+  }
+
+  public void setJobAttemptNumber(final int jobAttemptNumber) {
+    this.jobAttemptNumber = jobAttemptNumber;
+  }
+
+  public String getTaskID() {
+    return taskID;
+  }
+
+  public void setTaskID(final String taskID) {
+    this.taskID = taskID;
+  }
+
+  public String getTaskAttemptID() {
+    return taskAttemptID;
+  }
+
+  public void setTaskAttemptID(final String taskAttemptID) {
+    this.taskAttemptID = taskAttemptID;
+  }
+
+  public String getTaskAttemptDir() {
+    return taskAttemptDir;
+  }
+
+  public void setTaskAttemptDir(final String taskAttemptDir) {
+    this.taskAttemptDir = taskAttemptDir;
+  }
+
+  /**
+   * Add a file to the list of files to commit.
+   * @param entry entry to add
+   */
+  public void addFileToCommit(FileEntry entry) {
+    filesToCommit.add(entry);
+  }
+
+  public List<FileEntry> getFilesToCommit() {
+    return filesToCommit;
+  }
+
+  /**
+   * Calculate the total amount of data which will be committed.
+   * @return the sum of sizes of all files to commit.
+   */
+  @JsonIgnore
+  public long getTotalFileSize() {
+    return filesToCommit.stream().mapToLong(FileEntry::getSize).sum();
+  }
+
+  /**
+   * All the destination directories.
+   * @return directory list.
+   */
+  public List<DirEntry> getDestDirectories() {
+    return destDirectories;
+  }
+
+  /**
+   * Add a directory to the list of directories to create.
+   * @param entry entry to add
+   */
+  public void addDirectory(DirEntry entry) {
+    destDirectories.add(entry);
+  }
+
+  public Map<String, String> getExtraData() {
+    return extraData;
+  }
+
+  @Override
+  public byte[] toBytes() throws IOException {
+    return serializer().toBytes(this);
+  }
+
+  /**
+   * To JSON.
+   * @return json string value.
+   * @throws IOException failure
+   */
+  public String toJson() throws IOException {
+    return serializer().toJson(this);
+  }
+
+  @Override
+  public void save(FileSystem fs, Path path, boolean overwrite)
+      throws IOException {
+    serializer().save(fs, path, this, overwrite);
+  }
+
+  /**
+   * Validate the data: fields which must be non-empty must be set.
+   * @throws IOException if the data is invalid
+   * @return this instance, for call chaining.
+   */
+  public TaskManifest validate() throws IOException {
+    verify(TYPE.equals(type), "Wrong type: %s", type);
+    verify(version == VERSION, "Wrong version: %s", version);
+    validateCollectionClass(extraData.keySet(), String.class);
+    validateCollectionClass(extraData.values(), String.class);
+    Set<String> destinations = new HashSet<>(filesToCommit.size());
+    validateCollectionClass(filesToCommit, FileEntry.class);
+    for (FileEntry c : filesToCommit) {
+      c.validate();
+      verify(!destinations.contains(c.getDest()),
+          "Destination %s is written to by more than one pending commit",
+          c.getDest());
+      destinations.add(c.getDest());
+    }
+    return this;
+  }
+
+  /**
+   * Get a JSON serializer for this class.
+   * @return a serializer.
+   */
+  @Override
+  public JsonSerialization<TaskManifest> createSerializer() {
+    return serializer();
+  }
+
+  /**
+   * Create a JSON serializer for this class.
+   * @return a serializer.
+   */
+  public static JsonSerialization<TaskManifest> serializer() {
+    return new JsonSerialization<>(TaskManifest.class, false, true);
+  }
+
+  /**
+   * Load an instance from a file, then validate it.
+   * @param fs filesystem
+   * @param path path
+   * @return the loaded instance
+   * @throws IOException IO failure/the data is invalid
+   */
+  public static TaskManifest load(FileSystem fs, Path path)
+      throws IOException {
+    LOG.debug("Reading Manifest in file {}", path);
+    return serializer().load(fs, path).validate();
+  }
+
+  /**
+   * Load an instance from a file, then validate it.
+   * If loading through a listing, use this API so that FileStatus
+   * hints can be used.
+   * @param serializer serializer.
+   * @param fs filesystem
+   * @param path path to load from
+   * @param status status of file to load
+   * @return the loaded instance
+   * @throws IOException IO failure/the data is invalid
+   */
+  public static TaskManifest load(
+      JsonSerialization<TaskManifest> serializer,
+      FileSystem fs,
+      Path path,
+      FileStatus status)
+      throws IOException {
+    LOG.debug("Reading Manifest in file {}", path);
+    return serializer.load(fs, path, status)
+        .validate();
+  }
+
+}
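
As a quick illustration (not part of the patch), a sketch of the manifest round trip using the public API above; the filesystem, path and IDs are assumptions made only for the example:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;

    /** Illustrative only: save a manifest, then load and validate it. */
    public class TaskManifestRoundTrip {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        Path manifestPath = new Path("/tmp/demo/task_demo_0001.json");  // hypothetical path

        TaskManifest manifest = new TaskManifest();
        manifest.setJobId("job_demo_0001");
        manifest.setTaskAttemptID("attempt_demo_0001_m_000000_0");
        // FileEntry/DirEntry instances produced by the directory scan
        // would be added here via addFileToCommit()/addDirectory().
        manifest.save(fs, manifestPath, true);

        // load() validates the type, version and duplicate destinations.
        TaskManifest loaded = TaskManifest.load(fs, manifestPath);
        System.out.println(loaded.toJson());
      }
    }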

+ 41 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/package-info.java

@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Persistence formats.
+ * These are the persistence formats used for passing data from tasks
+ * to the job committer
+ * {@link org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest},
+ * and for a {@code _SUCCESS} file, which is in
+ * {@link org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData}.
+ * The {@code _SUCCESS} file is a copy of the S3A Committer
+ * {@code org.apache.hadoop.fs.s3a.commit.files.ManifestSuccessData},
+ * the intent being that at the JSON level they are compatible.
+ * This is to aid testing/validation and support calls, with one single
+ * format to load.
+ *
+ * Consult the individual formats for their declarations of access;
+ * the _SUCCESS file is one which tests may use.
+ *
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.files;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;

+ 94 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/AuditingIntegration.java

@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.fs.audit.CommonAuditContext;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConfig;
+
+import static org.apache.hadoop.fs.audit.AuditConstants.PARAM_JOB_ID;
+import static org.apache.hadoop.fs.audit.CommonAuditContext.currentAuditContext;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.CONTEXT_ATTR_STAGE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.CONTEXT_ATTR_TASK_ATTEMPT_ID;
+
+/**
+ * Helper class to support integration with Hadoop 3.3.2+ Auditing.
+ * This MUST BE the sole place where fs.audit methods are used, so that it
+ * can be replaced by a stub class on any backport.
+ */
+@InterfaceAudience.Private
+public final class AuditingIntegration {
+  private AuditingIntegration() {
+  }
+
+  /**
+   * Add jobID to current context; also
+   * task attempt ID if set.
+   */
+  public static void updateCommonContextOnCommitterEntry(
+      ManifestCommitterConfig committerConfig) {
+    CommonAuditContext context = currentAuditContext();
+    context.put(PARAM_JOB_ID,
+        committerConfig.getJobUniqueId());
+    // maybe the task attempt ID.
+    if (!committerConfig.getTaskAttemptId().isEmpty()) {
+      context.put(CONTEXT_ATTR_TASK_ATTEMPT_ID,
+          committerConfig.getTaskAttemptId());
+    }
+  }
+
+  /**
+   * Callback on stage entry.
+   * Sets the active stage attribute in the
+   * common audit context.
+   * @param stage new stage
+   */
+  public static void enterStage(String stage) {
+    currentAuditContext().put(CONTEXT_ATTR_STAGE, stage);
+  }
+
+  /**
+   * Remove stage from common audit context.
+   */
+  public static void exitStage() {
+    currentAuditContext().remove(CONTEXT_ATTR_STAGE);
+  }
+
+  /**
+   * Remove commit info at the end of the task or job.
+   */
+  public static void updateCommonContextOnCommitterExit() {
+    currentAuditContext().remove(PARAM_JOB_ID);
+    currentAuditContext().remove(CONTEXT_ATTR_TASK_ATTEMPT_ID);
+  }
+
+  /**
+   * Update the thread context with the stage name and
+   * job ID.
+   * This MUST be invoked at the start of methods invoked in helper threads,
+   * to ensure that they are all annotated with job and stage.
+   * @param jobId job ID.
+   * @param stage stage name.
+   */
+  public static void enterStageWorker(String jobId, String stage) {
+    CommonAuditContext context = currentAuditContext();
+    context.put(PARAM_JOB_ID, jobId);
+    context.put(CONTEXT_ATTR_STAGE, stage);
+  }
+}
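
A short sketch (not from the patch) of the intended calling pattern for helper threads; the stage name below is illustrative:

    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration;

    /** Illustrative only: annotate work done in a pool thread with job ID and stage. */
    public final class StageAuditDemo {
      public static void runInWorker(String jobId, Runnable work) {
        AuditingIntegration.enterStageWorker(jobId, "rename-files");  // hypothetical stage name
        try {
          work.run();
        } finally {
          AuditingIntegration.exitStage();
        }
      }
    }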

+ 130 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/InternalConstants.java

@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import java.util.Set;
+
+import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableSet;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OBJECT_CONTINUE_LIST_REQUEST;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OBJECT_LIST_REQUEST;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_DELETE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_GET_FILE_STATUS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_IS_DIRECTORY;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_IS_FILE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_LIST_STATUS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_MKDIRS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.STORE_IO_RATE_LIMITED;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.*;
+
+/**
+ * Constants internal to the manifest committer.
+ */
+@InterfaceAudience.Private
+public final class InternalConstants {
+  private InternalConstants() {
+  }
+
+  /**
+   * Durations.
+   */
+  public static final String[] DURATION_STATISTICS = {
+
+      /* Job stages. */
+      OP_STAGE_JOB_ABORT,
+      OP_STAGE_JOB_CLEANUP,
+      OP_STAGE_JOB_COMMIT,
+      OP_STAGE_JOB_CREATE_TARGET_DIRS,
+      OP_STAGE_JOB_LOAD_MANIFESTS,
+      OP_STAGE_JOB_RENAME_FILES,
+      OP_STAGE_JOB_SAVE_SUCCESS,
+      OP_STAGE_JOB_SETUP,
+      OP_STAGE_JOB_VALIDATE_OUTPUT,
+
+      /* Task stages. */
+
+      OP_STAGE_TASK_ABORT_TASK,
+      OP_STAGE_TASK_COMMIT,
+      OP_STAGE_TASK_SAVE_MANIFEST,
+      OP_STAGE_TASK_SCAN_DIRECTORY,
+      OP_STAGE_TASK_SETUP,
+
+      /* Lower level store/fs operations. */
+      OP_COMMIT_FILE_RENAME,
+      OP_CREATE_DIRECTORIES,
+      OP_CREATE_ONE_DIRECTORY,
+      OP_DIRECTORY_SCAN,
+      OP_DELETE,
+      OP_DELETE_FILE_UNDER_DESTINATION,
+      OP_GET_FILE_STATUS,
+      OP_IS_DIRECTORY,
+      OP_IS_FILE,
+      OP_LIST_STATUS,
+      OP_LOAD_MANIFEST,
+      OP_LOAD_ALL_MANIFESTS,
+      OP_MKDIRS,
+      OP_MKDIRS_RETURNED_FALSE,
+      OP_MSYNC,
+      OP_PREPARE_DIR_ANCESTORS,
+      OP_RENAME_FILE,
+      OP_SAVE_TASK_MANIFEST,
+
+      OBJECT_LIST_REQUEST,
+      OBJECT_CONTINUE_LIST_REQUEST,
+
+      STORE_IO_RATE_LIMITED
+  };
+
+  /**
+   * Counters.
+   */
+  public static final String[] COUNTER_STATISTICS = {
+      COMMITTER_BYTES_COMMITTED_COUNT,
+      COMMITTER_FILES_COMMITTED_COUNT,
+      COMMITTER_TASKS_COMPLETED_COUNT,
+      COMMITTER_TASKS_FAILED_COUNT,
+      COMMITTER_TASK_DIRECTORY_COUNT_MEAN,
+      COMMITTER_TASK_DIRECTORY_DEPTH_MEAN,
+      COMMITTER_TASK_FILE_COUNT_MEAN,
+      COMMITTER_TASK_FILE_SIZE_MEAN,
+      COMMITTER_TASK_MANIFEST_FILE_SIZE,
+      OP_COMMIT_FILE_RENAME_RECOVERED,
+  };
+
+  /**
+   * Error string from ABFS connector on timeout.
+   */
+  public static final String OPERATION_TIMED_OUT = "OperationTimedOut";
+
+  /**
+   * Format string for task attempt names.
+   */
+  public static final String NAME_FORMAT_TASK_ATTEMPT = "[Task-Attempt %s]";
+
+  /**
+   * Format string for job attempt names.
+   */
+  public static final String NAME_FORMAT_JOB_ATTEMPT = "[Job-Attempt %s]";
+
+  /** Schemes of filesystems known not to work with this committer. */
+  public static final Set<String> UNSUPPORTED_FS_SCHEMAS =
+      ImmutableSet.of("s3a", "wasb");
+}

+ 374 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestCommitterSupport.java

@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import java.io.IOException;
+import java.time.ZonedDateTime;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.EtagSource;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.statistics.IOStatisticsAggregator;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStoreBuilder;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.UserGroupInformation;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.iostatisticsStore;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.PENDING_DIR_NAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.INITIAL_APP_ATTEMPT_ID;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_ATTEMPT_DIR_FORMAT_STR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_DIR_FORMAT_STR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_ID_SOURCE_MAPREDUCE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_TASK_ATTEMPT_SUBDIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_TASK_MANIFEST_SUBDIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_CLASSNAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_SUFFIX;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_STORE_OPERATIONS_CLASS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SPARK_WRITE_UUID;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUMMARY_FILENAME_FORMAT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.TMP_SUFFIX;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.PRINCIPAL;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.STAGE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.COUNTER_STATISTICS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.DURATION_STATISTICS;
+
+/**
+ * Class for manifest committer support util methods.
+ */
+
+@InterfaceAudience.Private
+public final class ManifestCommitterSupport {
+
+  private ManifestCommitterSupport() {
+  }
+
+  /**
+   * Create an IOStatistics Store with the standard statistics
+   * set up.
+   * @return a store builder preconfigured with the standard stats.
+   */
+  public static IOStatisticsStoreBuilder createIOStatisticsStore() {
+
+    final IOStatisticsStoreBuilder store
+        = iostatisticsStore();
+
+    store.withCounters(COUNTER_STATISTICS);
+    store.withMaximums(COUNTER_STATISTICS);
+    store.withMinimums(COUNTER_STATISTICS);
+    store.withMeanStatistics(COUNTER_STATISTICS);
+    store.withDurationTracking(DURATION_STATISTICS);
+    return store;
+  }
+
+  /**
+   * If the object is an IOStatisticsSource, get and add
+   * its IOStatistics.
+   * @param ios aggregator to update.
+   * @param o source object.
+   */
+  public static void maybeAddIOStatistics(IOStatisticsAggregator ios,
+      Object o) {
+    if (o instanceof IOStatisticsSource) {
+      ios.aggregate(((IOStatisticsSource) o).getIOStatistics());
+    }
+  }
+
+  /**
+   * Build a Job UUID from the job conf (the
+   * {@link ManifestCommitterConstants#SPARK_WRITE_UUID}
+   * option, if set) or the MR job ID.
+   * @param conf job/task configuration
+   * @param jobId job ID from YARN or Spark.
+   * @return (a job ID, source)
+   */
+  public static Pair<String, String> buildJobUUID(Configuration conf,
+      JobID jobId) {
+    String jobUUID = conf.getTrimmed(SPARK_WRITE_UUID, "");
+    if (jobUUID.isEmpty()) {
+      jobUUID = jobId.toString();
+      return Pair.of(jobUUID, JOB_ID_SOURCE_MAPREDUCE);
+    } else {
+      return Pair.of(jobUUID, SPARK_WRITE_UUID);
+    }
+  }
+
+  /**
+   * Get the location of pending job attempts.
+   * @param out the base output directory.
+   * @return the location of pending job attempts.
+   */
+  public static Path getPendingJobAttemptsPath(Path out) {
+    return new Path(out, PENDING_DIR_NAME);
+  }
+
+  /**
+   * Get the Application Attempt Id for this job.
+   * @param context the context to look in
+   * @return the Application Attempt Id for a given job.
+   */
+  public static int getAppAttemptId(JobContext context) {
+    return getAppAttemptId(context.getConfiguration());
+  }
+
+  /**
+   * Get the Application Attempt Id for this job
+   * by looking for {@link MRJobConfig#APPLICATION_ATTEMPT_ID}
+   * in the configuration, falling back to 0 if unset.
+   * For Spark it will always be 0; for MR it will be set in the AM
+   * to the {@code ApplicationAttemptId} the AM is launched with.
+   * @param conf job configuration.
+   * @return the Application Attempt Id for the job.
+   */
+  public static int getAppAttemptId(Configuration conf) {
+    return conf.getInt(MRJobConfig.APPLICATION_ATTEMPT_ID,
+        INITIAL_APP_ATTEMPT_ID);
+  }
+
+  /**
+   * Get the path in the job attempt dir for a manifest for a task.
+   * @param manifestDir manifest directory
+   * @param taskId taskID.
+   * @return the final path to rename the manifest file to
+   */
+  public static Path manifestPathForTask(Path manifestDir, String taskId) {
+
+    return new Path(manifestDir, taskId + MANIFEST_SUFFIX);
+  }
+
+  /**
+   * Get the temporary path in the manifest subdir to which a
+   * task attempt's manifest is saved before being renamed to the
+   * path defined by {@link #manifestPathForTask(Path, String)}.
+   * @param manifestDir manifest directory
+   * @param taskAttemptId task attempt ID.
+   * @return the path to save/load the manifest.
+   */
+  public static Path manifestTempPathForTaskAttempt(Path manifestDir,
+      String taskAttemptId) {
+    return new Path(manifestDir,
+        taskAttemptId + MANIFEST_SUFFIX + TMP_SUFFIX);
+  }
+
+  /**
+   * Create a task manifest from the stage config; the config must be
+   * for a task attempt.
+   * @param stageConfig stage config.
+   * @return a manifest with job and task attempt info set up.
+   */
+  public static TaskManifest createTaskManifest(StageConfig stageConfig) {
+    final TaskManifest manifest = new TaskManifest();
+    manifest.setTaskAttemptID(stageConfig.getTaskAttemptId());
+    manifest.setTaskID(stageConfig.getTaskId());
+    manifest.setJobId(stageConfig.getJobId());
+    manifest.setJobAttemptNumber(stageConfig.getJobAttemptNumber());
+    manifest.setTaskAttemptDir(
+        stageConfig.getTaskAttemptDir().toUri().toString());
+    return manifest;
+  }
+
+  /**
+   * Create success/outcome data.
+   * @param stageConfig configuration.
+   * @param stage stage name to record in the diagnostics.
+   * @return a _SUCCESS object with some diagnostics.
+   */
+  public static ManifestSuccessData createManifestOutcome(
+      StageConfig stageConfig, String stage) {
+    final ManifestSuccessData outcome = new ManifestSuccessData();
+    outcome.setJobId(stageConfig.getJobId());
+    outcome.setJobIdSource(stageConfig.getJobIdSource());
+    outcome.setCommitter(MANIFEST_COMMITTER_CLASSNAME);
+    // real timestamp
+    outcome.setTimestamp(System.currentTimeMillis());
+    final ZonedDateTime now = ZonedDateTime.now();
+    outcome.setDate(now.toString());
+    outcome.setHostname(NetUtils.getLocalHostname());
+    // add some extra diagnostics which can still be parsed by older
+    // builds of test applications.
+    // Audit Span information can go in here too, in future.
+    try {
+      outcome.putDiagnostic(PRINCIPAL,
+          UserGroupInformation.getCurrentUser().getShortUserName());
+    } catch (IOException ignored) {
+      // don't know who we are? exclude from the diagnostics.
+    }
+    outcome.putDiagnostic(STAGE, stage);
+    return outcome;
+  }
+
+  /**
+   * Create the filename for a report from the jobID.
+   * @param jobId jobId
+   * @return filename for a report.
+   */
+  public static String createJobSummaryFilename(String jobId) {
+    return String.format(SUMMARY_FILENAME_FORMAT, jobId);
+  }
+
+  /**
+   * Get an etag from a FileStatus, provided it is
+   * an implementation of EtagSource; return null
+   * otherwise.
+   * @param status the status; may be null.
+   * @return the etag, or null if not provided.
+   */
+  public static String getEtag(FileStatus status) {
+    if (status instanceof EtagSource) {
+      return ((EtagSource) status).getEtag();
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Create the manifest store operations for the given FS.
+   * This supports binding to custom filesystem handlers.
+   * @param conf configuration.
+   * @param filesystem fs.
+   * @param path path under FS.
+   * @return a bound store operations instance.
+   * @throws IOException on binding/init problems.
+   */
+  public static ManifestStoreOperations createManifestStoreOperations(
+      final Configuration conf,
+      final FileSystem filesystem,
+      final Path path) throws IOException {
+    try {
+      final Class<? extends ManifestStoreOperations> storeClass = conf.getClass(
+          OPT_STORE_OPERATIONS_CLASS,
+          ManifestStoreOperationsThroughFileSystem.class,
+          ManifestStoreOperations.class);
+      final ManifestStoreOperations operations = storeClass.
+          getDeclaredConstructor().newInstance();
+      operations.bindToFileSystem(filesystem, path);
+      return operations;
+    } catch (Exception e) {
+      throw new PathIOException(path.toString(),
+          "Failed to create Store Operations from configuration option "
+              + OPT_STORE_OPERATIONS_CLASS
+              + ":" + e, e);
+    }
+  }
+
+  /**
+   * Logic to create directory names from job and attempt.
+   * This is self-contained so that it can be used in tests
+   * as well as in the committer.
+   */
+  public static class AttemptDirectories {
+
+    /**
+     * Job output path.
+     */
+    private final Path outputPath;
+
+    /**
+     * Path for the job attempt.
+     */
+    private final Path jobAttemptDir;
+
+    /**
+     * Path for the job.
+     */
+    private final Path jobPath;
+
+    /**
+     * Subdir under the job attempt dir where task
+     * attempts will have subdirectories.
+     */
+    private final Path jobAttemptTaskSubDir;
+
+    /**
+     * temp directory under job dest dir.
+     */
+    private final Path outputTempSubDir;
+
+    /**
+     * Directory to save manifests into.
+     */
+    private final Path taskManifestDir;
+
+    /**
+     * Build the attempt directories.
+     * @param outputPath output path
+     * @param jobUniqueId job ID/UUID
+     * @param jobAttemptNumber job attempt number
+     */
+    public AttemptDirectories(
+        Path outputPath,
+        String jobUniqueId,
+        int jobAttemptNumber) {
+      this.outputPath = requireNonNull(outputPath, "Output path");
+
+      this.outputTempSubDir = new Path(outputPath, PENDING_DIR_NAME);
+      // build the path for the job
+      this.jobPath = new Path(outputTempSubDir,
+          String.format(JOB_DIR_FORMAT_STR, jobUniqueId));
+
+      // then the specific path underneath that for the attempt.
+      this.jobAttemptDir = new Path(jobPath,
+          String.format(JOB_ATTEMPT_DIR_FORMAT_STR, jobAttemptNumber));
+
+      // subdir for task attempts.
+      this.jobAttemptTaskSubDir = new Path(jobAttemptDir, JOB_TASK_ATTEMPT_SUBDIR);
+
+      this.taskManifestDir = new Path(jobAttemptDir, JOB_TASK_MANIFEST_SUBDIR);
+    }
+
+    public Path getOutputPath() {
+      return outputPath;
+    }
+
+    public Path getJobAttemptDir() {
+      return jobAttemptDir;
+    }
+
+    public Path getJobPath() {
+      return jobPath;
+    }
+
+    public Path getJobAttemptTaskSubDir() {
+      return jobAttemptTaskSubDir;
+    }
+
+    public Path getTaskAttemptPath(String taskAttemptId) {
+      return new Path(jobAttemptTaskSubDir, taskAttemptId);
+    }
+
+    public Path getOutputTempSubDir() {
+      return outputTempSubDir;
+    }
+
+    public Path getTaskManifestDir() {
+      return taskManifestDir;
+    }
+  }
+}
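
To make the directory layout concrete, a sketch (not from the patch) which prints the paths computed by AttemptDirectories; the destination path and IDs are made up:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.AttemptDirectories;

    /** Illustrative only: show where a job attempt's intermediate data lives. */
    public class AttemptDirectoriesDemo {
      public static void main(String[] args) {
        AttemptDirectories dirs = new AttemptDirectories(
            new Path("abfs://container@account.dfs.core.windows.net/output"),  // hypothetical destination
            "job-uuid-1234",
            1);
        System.out.println("job attempt dir:  " + dirs.getJobAttemptDir());
        System.out.println("manifest dir:     " + dirs.getTaskManifestDir());
        System.out.println("task attempt dir: " + dirs.getTaskAttemptPath("attempt_demo_m_000000_0"));
      }
    }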

+ 291 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperations.java

@@ -0,0 +1,291 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import javax.annotation.Nullable;
+import java.io.Closeable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.time.Duration;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.JsonSerialization;
+
+/**
+ * FileSystem operations which are needed to generate the task manifest.
+ * The specific choice of which implementation to use is configurable.
+ * Object store implementations MAY subclass if they
+ * need to implement resilient commit operations.
+ * However, the actual API MUST NOT be used outside
+ * the manifest committer and its tests.
+ */
+@InterfaceAudience.LimitedPrivate("mapreduce, object-stores")
+@InterfaceStability.Unstable
+public abstract class ManifestStoreOperations implements Closeable {
+
+  /**
+   * Bind to the filesystem.
+   * This is called by the manifest committer after the operations
+   * have been instantiated.
+   * @param fileSystem target FS
+   * @param path actual path under FS.
+   * @throws IOException if there are binding problems.
+   */
+  public void bindToFileSystem(FileSystem fileSystem, Path path) throws IOException {
+
+  }
+
+  /**
+   * Forward to {@link FileSystem#getFileStatus(Path)}.
+   * @param path path
+   * @return status
+   * @throws IOException failure.
+   */
+  public abstract FileStatus getFileStatus(Path path) throws IOException;
+
+  /**
+   * Is a path a file? Used during directory creation.
+   * This is a copy of FileSystem.isFile();
+   * {@code ManifestStoreOperationsThroughFileSystem} calls into
+   * the FS directly so that stores which optimize their probes
+   * can save on IO.
+   * @param path path to probe
+   * @return true if the path exists and resolves to a file
+   * @throws IOException failure other than FileNotFoundException
+   */
+  public boolean isFile(Path path) throws IOException {
+    try {
+      return getFileStatus(path).isFile();
+    } catch (FileNotFoundException e) {
+      return false;
+    }
+  }
+
+  /**
+   * Forward to {@link FileSystem#delete(Path, boolean)}.
+   * If it returns without an error: there is nothing at
+   * the end of the path.
+   * @param path path
+   * @param recursive recursive delete.
+   * @return true if the path was deleted.
+   * @throws IOException failure.
+   */
+  public abstract boolean delete(Path path, boolean recursive)
+      throws IOException;
+
+  /**
+   * Forward to {@link FileSystem#mkdirs(Path)}.
+   * Usual "what does 'false' mean" ambiguity.
+   * @param path path
+   * @return true if the directory was created.
+   * @throws IOException failure.
+   */
+  public abstract boolean mkdirs(Path path) throws IOException;
+
+  /**
+   * Forward to {@link FileSystem#rename(Path, Path)}.
+   * Usual "what does 'false' mean" ambiguity.
+   * @param source source file
+   * @param dest destination path -which must not exist.
+   * @return the return value of the rename
+   * @throws IOException failure.
+   */
+  public abstract boolean renameFile(Path source, Path dest)
+      throws IOException;
+
+  /**
+   * Rename a dir; the default implementation forwards
+   * to {@link #renameFile(Path, Path)}.
+   * Usual "what does 'false' mean?" ambiguity.
+   * @param source source directory
+   * @param dest destination path -which must not exist.
+   * @return the return value of the rename.
+   * @throws IOException failure.
+   */
+  public boolean renameDir(Path source, Path dest)
+      throws IOException {
+    return renameFile(source, dest);
+  }
+
+  /**
+   * List the directory.
+   * @param path path to list.
+   * @return an iterator over the results.
+   * @throws IOException any immediate failure.
+   */
+  public abstract RemoteIterator<FileStatus> listStatusIterator(Path path)
+      throws IOException;
+
+  /**
+   * Load a task manifest from the store.
+   * With a real FS, this is done with
+   * {@link TaskManifest#load(JsonSerialization, FileSystem, Path, FileStatus)}
+   *
+   * @param serializer serializer.
+   * @param st status with the path and other data.
+   * @return the manifest
+   * @throws IOException failure to load/parse
+   */
+  public abstract TaskManifest loadTaskManifest(
+      JsonSerialization<TaskManifest> serializer,
+      FileStatus st) throws IOException;
+
+  /**
+   * Save a manifest (task manifest or success file) by {@code FileSystem.create(path)}.
+   * There's no attempt at renaming anything here.
+   * @param manifestData the manifest/success file
+   * @param path temp path for the initial save
+   * @param overwrite should create(overwrite=true) be used?
+   * @throws IOException failure to load/parse
+   */
+  public abstract <T extends AbstractManifestData<T>> void save(
+      T manifestData,
+      Path path,
+      boolean overwrite) throws IOException;
+
+  /**
+   * Make an msync() call; swallow when unsupported.
+   * @param path path
+   * @throws IOException IO failure
+   */
+  public void msync(Path path) throws IOException {
+
+  }
+
+
+  /**
+   * Extract an etag from a status if the conditions are met.
+   * If the conditions are not met, return null or ""; they will
+   * both be treated as "no etags available"
+   * <pre>
+   *   1. The status is of a type which the implementation recognizes
+   *   as containing an etag.
+   *   2. After casting the etag field can be retrieved
+   *   3. and that value is non-null/non-empty.
+   * </pre>
+   * @param status status, which may be null or any subclass of FileStatus.
+   * @return either a valid etag, or null or "".
+   */
+  public String getEtag(FileStatus status) {
+    return ManifestCommitterSupport.getEtag(status);
+  }
+
+  /**
+   * Does the store preserve etags through renames.
+   * If true, and if the source listing entry has an etag,
+   * it will be used to attempt to validate a failed rename.
+   * @param path path to probe.
+   * @return true if etag comparison is a valid strategy.
+   */
+  public boolean storePreservesEtagsThroughRenames(Path path) {
+    return false;
+  }
+
+  /**
+   * Does the store provide rename resilience through an
+   * implementation of {@link #commitFile(FileEntry)}?
+   * If true then that method will be invoked to commit work
+   * @return true if resilient commit support is available.
+   */
+  public boolean storeSupportsResilientCommit() {
+    return false;
+  }
+
+  /**
+   * Commit one file through any resilient API.
+   * This operation MUST rename source to destination,
+   * else raise an exception.
+   * The result indicates whether or not some
+   * form of recovery took place.
+   *
+   * If etags were collected during task commit, these will be
+   * in the entries passed in here.
+   *
+   * The base implementation always raises
+   * {@code UnsupportedOperationException}
+   * @param entry entry to commit
+   * @return the result of the commit
+   * @throws IOException failure.
+   * @throws UnsupportedOperationException if not available.
+   *
+   */
+  public CommitFileResult commitFile(FileEntry entry) throws IOException {
+    throw new UnsupportedOperationException("Resilient commit not supported");
+  }
+
+  /**
+   * Outcome from the operation {@link #commitFile(FileEntry)}.
+   * As a rename failure MUST raise an exception, this result
+   * only declares whether or not some form of recovery took place.
+   */
+  public static final class CommitFileResult {
+
+    /** Did recovery take place? */
+    private final boolean recovered;
+
+    /** Time waiting for IO capacity, may be null. */
+    @Nullable
+    private final Duration waitTime;
+
+    /**
+     * Full commit result.
+     * @param recovered Did recovery take place?
+     * @param waitTime any time spent waiting for IO capacity.
+     */
+    public static CommitFileResult fromResilientCommit(
+        final boolean recovered,
+        final Duration waitTime) {
+      return new CommitFileResult(recovered, waitTime);
+    }
+
+    /**
+     * Full commit result.
+     * @param recovered Did recovery take place?
+     * @param waitTime any time spent waiting for IO capacity.
+     */
+    public CommitFileResult(final boolean recovered,
+        @Nullable final Duration waitTime) {
+
+      this.recovered = recovered;
+      this.waitTime = waitTime;
+    }
+
+    /**
+     * Did some form of recovery take place?
+     * @return true if the commit succeeded through some form of (etag-based) recovery
+     */
+    public boolean recovered() {
+      return recovered;
+    }
+
+    @Nullable
+    public Duration getWaitTime() {
+      return waitTime;
+    }
+  }
+
+}
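
A sketch (not from the patch) of how a rename stage could drive this API; the real committer's statistics and retry handling are omitted, and the helper class name is hypothetical:

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;

    /** Illustrative only: choose between the resilient and classic rename paths. */
    public final class CommitOneFile {
      public static boolean commit(ManifestStoreOperations operations,
          FileEntry entry, Path source, Path dest) throws IOException {
        if (operations.storeSupportsResilientCommit()) {
          // resilient path: rename with optional etag-based recovery
          return operations.commitFile(entry).recovered();
        }
        // classic path: plain rename, with the usual "false" ambiguity
        return operations.renameFile(source, dest);
      }
    }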

+ 187 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java

@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.CommonPathCapabilities;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.JsonSerialization;
+
+/**
+ * Implementation of manifest store operations through the filesystem API.
+ * This class is subclassed in the ABFS module, which adds the resilient
+ * commit method.
+ */
+@InterfaceAudience.LimitedPrivate("mapreduce, object-stores")
+@InterfaceStability.Unstable
+public class ManifestStoreOperationsThroughFileSystem extends ManifestStoreOperations {
+
+  /**
+   * Filesystem; set in {@link #bindToFileSystem(FileSystem, Path)}.
+   */
+  private FileSystem fileSystem;
+
+  /**
+   * Has a call to FileSystem.msync() failed as unsupported?
+   * If so, no new attempts will be made when
+   * (@link {@link #msync(Path)} is invoked.
+   */
+  private boolean msyncUnsupported = false;
+
+  /**
+   * Direct Constructor.
+   * @param fileSystem filesystem to write through.
+   */
+  public ManifestStoreOperationsThroughFileSystem(final FileSystem fileSystem) {
+    this.fileSystem = fileSystem;
+  }
+
+  /**
+   * Constructor used for introspection-based binding.
+   */
+  public ManifestStoreOperationsThroughFileSystem() {
+  }
+
+  @Override
+  public void close() throws IOException {
+    /* no-op; FS is assumed to be shared. */
+
+  }
+
+  /**
+   * Get the filesystem.
+   * @return the filesystem; null until bound.
+   */
+  public FileSystem getFileSystem() {
+    return fileSystem;
+  }
+
+  @Override
+  public void bindToFileSystem(FileSystem filesystem, Path path) throws IOException {
+    fileSystem = filesystem;
+  }
+
+  @Override
+  public FileStatus getFileStatus(Path path) throws IOException {
+    return fileSystem.getFileStatus(path);
+  }
+
+  /**
+   * Using FileSystem.isFile to offer stores the option to optimize their probes.
+   * @param path path to probe
+   * @return true if the path resolves to a file.
+   * @throws IOException IO failure.
+   */
+  @SuppressWarnings("deprecation")
+  @Override
+  public boolean isFile(Path path) throws IOException {
+    return fileSystem.isFile(path);
+  }
+
+  @Override
+  public boolean delete(Path path, boolean recursive)
+      throws IOException {
+    return fileSystem.delete(path, recursive);
+  }
+
+  @Override
+  public boolean mkdirs(Path path)
+      throws IOException {
+    return fileSystem.mkdirs(path);
+  }
+
+  @Override
+  public boolean renameFile(Path source, Path dest)
+      throws IOException {
+    return fileSystem.rename(source, dest);
+  }
+
+  @Override
+  public RemoteIterator<FileStatus> listStatusIterator(Path path)
+      throws IOException {
+    return fileSystem.listStatusIterator(path);
+  }
+
+  @Override
+  public TaskManifest loadTaskManifest(
+      JsonSerialization<TaskManifest> serializer,
+      FileStatus st) throws IOException {
+    return TaskManifest.load(serializer, fileSystem, st.getPath(), st);
+  }
+
+  @Override
+  public <T extends AbstractManifestData<T>> void save(
+      final T manifestData,
+      final Path path,
+      final boolean overwrite) throws IOException {
+    manifestData.save(fileSystem, path, overwrite);
+  }
+
+  /**
+   * Probe filesystem capabilities.
+   * @param path path to probe.
+   * @return true if the FS declares that etags are preserved across its renames.
+   */
+  @Override
+  public boolean storePreservesEtagsThroughRenames(Path path) {
+    try {
+      return fileSystem.hasPathCapability(path,
+          CommonPathCapabilities.ETAGS_PRESERVED_IN_RENAME);
+    } catch (IOException ignored) {
+      return false;
+    }
+  }
+
+  /**
+   * Invokes FileSystem msync(); swallows UnsupportedOperationExceptions.
+   * This ensures client metadata caches are in sync in an HDFS-HA deployment.
+   * No other filesystems support this; in the absence of a hasPathCapability()
+   * probe, after the operation is rejected, a flag is set
+   * to stop further attempts from even trying.
+   * @param path path
+   * @throws IOException failure to synchronize.
+   */
+  @Override
+  public void msync(Path path) throws IOException {
+    // there's no need for atomicity here, as the sole cost of
+    // multiple failures is a few extra, rejected msync() calls.
+    if (msyncUnsupported) {
+      return;
+    }
+    // qualify so we can be confident that the FS being synced
+    // is the one we expect.
+    fileSystem.makeQualified(path);
+    try {
+      fileSystem.msync();
+    } catch (UnsupportedOperationException ignored) {
+      // this exception is the default.
+      // set the unsupported flag so no future attempts are made.
+      msyncUnsupported = true;
+    }
+  }
+
+}
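
As an example of this extension point (not part of the patch), a minimal subclass plus the configuration call which would select it through OPT_STORE_OPERATIONS_CLASS; whether a given store really preserves etags across renames is an assumption made here for illustration:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperationsThroughFileSystem;

    import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_STORE_OPERATIONS_CLASS;

    /** Illustrative only: a store which asserts that its renames preserve etags. */
    public class EtagAwareStoreOperations extends ManifestStoreOperationsThroughFileSystem {

      @Override
      public boolean storePreservesEtagsThroughRenames(Path path) {
        return true;  // assumption for the example; a real store should probe/verify this
      }

      /** Bind this class in through the committer's store-operations option. */
      public static void configure(Configuration conf) {
        conf.setClass(OPT_STORE_OPERATIONS_CLASS,
            EtagAwareStoreOperations.class,
            ManifestStoreOperations.class);
      }
    }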

+ 40 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/OutputValidationException.java

@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+
+/**
+ * Exception raised during validation.
+ * This can be treated differently from other outcomes.
+ */
+@InterfaceAudience.Private
+public class OutputValidationException extends PathIOException {
+  public OutputValidationException(Path path, String error) {
+    super(path.toUri().toString(), error);
+  }
+
+  public OutputValidationException(Path path,
+      String error,
+      Throwable cause) {
+    super(path.toUri().toString(), error, cause);
+  }
+}

+ 30 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/package-info.java

@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Implementation classes for the manifest committer.
+ * Nothing outside this module should be using these classes,
+ * except where explicitly stated.
+ */
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;

+ 34 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/package-info.java

@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Intermediate manifest committer.
+ *
+ * Optimized for object stores where listing is slow, directory renames may not
+ * be atomic, and the output is a deep tree of files intermixed with
+ * the output of (many) other task attempts.
+ *
+ * All classes in this module are private/unstable, except where stated.
+ */
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;

+ 63 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbortTaskStage.java

@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_ABORT_TASK;
+
+/**
+ * Abort a task.
+ *
+ * This is done by deleting the task attempt directory.
+ * Exceptions may or may not be suppressed.
+ */
+public class AbortTaskStage extends
+    AbstractJobOrTaskStage<Boolean, Path> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      AbortTaskStage.class);
+
+  public AbortTaskStage(final StageConfig stageConfig) {
+    super(true, stageConfig, OP_STAGE_TASK_ABORT_TASK, false);
+  }
+
+  /**
+   * Delete the task attempt directory.
+   * @param suppressExceptions should exceptions be ignored?
+   * @return the directory
+   * @throws IOException failure when exceptions were not suppressed
+   */
+  @Override
+  protected Path executeStage(final Boolean suppressExceptions)
+      throws IOException {
+    final Path dir = getTaskAttemptDir();
+    if (dir != null) {
+      LOG.info("{}: Deleting task attempt directory {}", getName(), dir);
+      deleteDir(dir, suppressExceptions);
+    }
+    return dir;
+  }
+
+}

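A minimal sketch of invoking this stage directly, assuming a fully populated StageConfig for the task attempt; the Boolean argument to apply() is the suppressExceptions flag and the return value is the deleted directory.

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.AbortTaskStage;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

    final class AbortTaskExample {
      /** Abort a task attempt, logging rather than rethrowing delete failures. */
      static Path abortQuietly(StageConfig stageConfig) throws IOException {
        return new AbortTaskStage(stageConfig).apply(true);
      }
    }
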
+ 942 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbstractJobOrTaskStage.java

@@ -0,0 +1,942 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.FileAlreadyExistsException;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.fs.statistics.DurationTracker;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.util.OperationDuration;
+import org.apache.hadoop.util.Preconditions;
+import org.apache.hadoop.util.functional.CallableRaisingIOE;
+import org.apache.hadoop.util.functional.RemoteIterators;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_DELETE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_GET_FILE_STATUS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_IS_FILE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_LIST_STATUS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_MKDIRS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.STORE_IO_RATE_LIMITED;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.createTracker;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfInvocation;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_SUFFIX;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_COMMIT_FILE_RENAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_COMMIT_FILE_RENAME_RECOVERED;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_LOAD_MANIFEST;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_MSYNC;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_RENAME_FILE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_SAVE_TASK_MANIFEST;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration.enterStageWorker;
+
+/**
+ * A Stage in Task/Job Commit.
+ * A stage can be executed once only, creating the return value of the
+ * {@link #apply(Object)} method, and, potentially, updating the state of the
+ * store via {@link ManifestStoreOperations}.
+ * IOStatistics will also be updated.
+ * Stages are expected to be combined to form the commit protocol.
+ * @param <IN> Type of arguments to the stage.
+ * @param <OUT> Type of result.
+ */
+public abstract class AbstractJobOrTaskStage<IN, OUT>
+    implements JobOrTaskStage<IN, OUT> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      AbstractJobOrTaskStage.class);
+
+  /**
+   * Error text on rename failure: {@value}.
+   */
+  public static final String FAILED_TO_RENAME_PREFIX = "Failed to ";
+
+  /**
+   * Is this a task stage? If so, toString() includes task
+   * info.
+   */
+  private final boolean isTaskStage;
+
+  /**
+   * Configuration of all the stages in the ongoing committer
+   * operation.
+   */
+  private final StageConfig stageConfig;
+
+  /**
+   * Name of the stage for statistics and logging.
+   */
+  private final String stageStatisticName;
+
+  /**
+   * Callbacks to update store.
+   * This is not made visible to the stages; they must
+   * go through the wrapper classes in this class, which
+   * add statistics and logging.
+   */
+  private final ManifestStoreOperations operations;
+
+  /**
+   * Submitter for doing IO against the store.
+   */
+  private final TaskPool.Submitter ioProcessors;
+
+  /**
+   * Used to stop any re-entrancy of the rename.
+   * This is an execute-once operation.
+   */
+  private final AtomicBoolean executed = new AtomicBoolean(false);
+
+  /**
+   * Tracker of the duration of the execution of the stage.
+   * set after {@link #executeStage(Object)} completes.
+   */
+  private DurationTracker stageExecutionTracker;
+
+  /**
+   * Name for logging.
+   */
+  private final String name;
+
+  /**
+   * Constructor.
+   * @param isTaskStage Is this a task stage?
+   * @param stageConfig stage-independent configuration.
+   * @param stageStatisticName name of the stage for statistics/logging
+   * @param requireIOProcessors are the IO processors required?
+   */
+  protected AbstractJobOrTaskStage(
+      final boolean isTaskStage,
+      final StageConfig stageConfig,
+      final String stageStatisticName,
+      final boolean requireIOProcessors) {
+    this.isTaskStage = isTaskStage;
+    this.stageStatisticName = stageStatisticName;
+    this.stageConfig = stageConfig;
+    requireNonNull(stageConfig.getDestinationDir(), "Destination Directory");
+    requireNonNull(stageConfig.getJobId(), "Job ID");
+    requireNonNull(stageConfig.getJobAttemptDir(), "Job attempt directory");
+    this.operations = requireNonNull(stageConfig.getOperations(),
+        "Operations callbacks");
+    // and the processors of work if required.
+    this.ioProcessors = bindProcessor(
+        requireIOProcessors,
+        stageConfig.getIoProcessors());
+    String stageName;
+    if (isTaskStage) {
+      // force fast failure.
+      getRequiredTaskId();
+      getRequiredTaskAttemptId();
+      getRequiredTaskAttemptDir();
+      stageName = String.format("[Task-Attempt %s]", getRequiredTaskAttemptId());
+    } else  {
+      stageName = String.format("[Job-Attempt %s/%02d]",
+          stageConfig.getJobId(),
+          stageConfig.getJobAttemptNumber());
+    }
+    name = stageName;
+  }
+
+  /**
+   * Bind to the processor if it is required.
+   * @param required is the processor required?
+   * @param processor processor
+   * @return the processor binding
+   * @throws NullPointerException if required == true and processor is null.
+   */
+  private TaskPool.Submitter bindProcessor(
+      final boolean required,
+      final TaskPool.Submitter processor) {
+    return required
+        ? requireNonNull(processor, "required IO processor is null")
+        : null;
+  }
+
+  /**
+   * Stage entry point.
+   * Verifies that this is the first and only time the stage is invoked,
+   * then calls {@link #executeStage(Object)} for the subclass
+   * to perform its part of the commit protocol.
+   * The duration of the stage is collected as a statistic, and its
+   * entry/exit logged at INFO.
+   * @param arguments arguments to the function.
+   * @return the result.
+   * @throws IOException failures.
+   */
+  @Override
+  public final OUT apply(final IN arguments) throws IOException {
+    executeOnlyOnce();
+    progress();
+    String stageName = getStageName(arguments);
+    getStageConfig().enterStage(stageName);
+    String statisticName = getStageStatisticName(arguments);
+    // track the duration of the stage execution.
+    LOG.info("{}: Executing Stage {}", getName(), stageName);
+    stageExecutionTracker = createTracker(getIOStatistics(), statisticName);
+    try {
+      // exec the input function and return its value
+      final OUT out = executeStage(arguments);
+      LOG.info("{}: Stage {} completed after {}",
+          getName(),
+          stageName,
+          OperationDuration.humanTime(
+              stageExecutionTracker.asDuration().toMillis()));
+      return out;
+    } catch (IOException | RuntimeException e) {
+      LOG.error("{}: Stage {} failed: after {}: {}",
+          getName(),
+          stageName,
+          OperationDuration.humanTime(
+              stageExecutionTracker.asDuration().toMillis()),
+          e.toString());
+      LOG.debug("{}: Stage failure:", getName(), e);
+      // input function failed: note it
+      stageExecutionTracker.failed();
+      // and rethrow
+      throw e;
+    } finally {
+      // update the tracker.
+      // this is called after the catch() call would have
+      // set the failed flag.
+      stageExecutionTracker.close();
+      progress();
+      getStageConfig().exitStage(stageName);
+    }
+  }
+
+  /**
+   * The work of a stage.
+   * Executed exactly once.
+   * @param arguments arguments to the function.
+   * @return the result.
+   * @throws IOException failures.
+   */
+  protected abstract OUT executeStage(IN arguments) throws IOException;
+
+  /**
+   * Check that the operation has not been invoked twice.
+   * This is an atomic check.
+   * @throws IllegalStateException on a second invocation.
+   */
+  private void executeOnlyOnce() {
+    Preconditions.checkState(
+        !executed.getAndSet(true),
+        "Stage attempted twice");
+  }
+
+  /**
+   * The stage statistic name.
+   * @param arguments args to the invocation.
+   * @return stage name.
+   */
+  protected String getStageStatisticName(IN arguments) {
+    return stageStatisticName;
+  }
+
+  /**
+   * Stage name for reporting; defaults to
+   * the value of {@link #getStageStatisticName(Object)}.
+   * @param arguments args to the invocation.
+   * @return name used in updating reports.
+   */
+  protected String getStageName(IN arguments) {
+    return getStageStatisticName(arguments);
+  }
+
+  /**
+   * Get the execution tracker; non-null
+   * after stage execution.
+   * @return a tracker or null.
+   */
+  public DurationTracker getStageExecutionTracker() {
+    return stageExecutionTracker;
+  }
+
+  /**
+   * Adds the duration of the stage execution to an IOStatistics store
+   * (such as the manifest to be saved).
+   * @param iostats store
+   * @param statistic statistic name.
+   */
+  public void addExecutionDurationToStatistics(IOStatisticsStore iostats,
+      String statistic) {
+    iostats.addTimedOperation(
+        statistic,
+        getStageExecutionTracker().asDuration());
+  }
+
+  /**
+   * Note any rate limiting to the given timing statistic.
+   * If the wait was 0, no statistics are updated.
+   * @param statistic statistic key.
+   * @param wait wait duration.
+   */
+  private void noteAnyRateLimiting(String statistic, Duration wait) {
+    if (!wait.isZero()) {
+      // rate limiting took place
+      getIOStatistics().addTimedOperation(
+          statistic,
+          wait.toMillis());
+    }
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder(
+        "AbstractJobOrTaskStage{");
+    sb.append(isTaskStage ? "Task Stage" : "Job Stage");
+    sb.append(" name='").append(name).append('\'');
+    sb.append(" stage='").append(stageStatisticName).append('\'');
+    sb.append('}');
+    return sb.toString();
+  }
+
+  /**
+   * The stage configuration.
+   * @return the stage configuration used by this stage.
+   */
+  protected StageConfig getStageConfig() {
+    return stageConfig;
+  }
+
+  /**
+   * Update the thread context with the stage name and
+   * job ID.
+   * This MUST be invoked at the start of methods invoked in helper threads,
+   * to ensure that they are all annotated with job and stage.
+   * @param stage stage name.
+   */
+  protected void updateAuditContext(final String stage) {
+    enterStageWorker(stageConfig.getJobId(), stage);
+  }
+
+  /**
+   * The IOStatistics are shared across all uses of the
+   * StageConfig.
+   * @return the (possibly shared) IOStatistics.
+   */
+  @Override
+  public final IOStatisticsStore getIOStatistics() {
+    return stageConfig.getIOStatistics();
+  }
+
+  /**
+   * Call progress() on any Progressable passed in.
+   */
+  protected final void progress() {
+    if (stageConfig.getProgressable() != null) {
+      stageConfig.getProgressable().progress();
+    }
+  }
+
+  /**
+   * Get a file status value or, if the path doesn't exist, return null.
+   * @param path path
+   * @return status or null
+   * @throws IOException IO Failure.
+   */
+  protected final FileStatus getFileStatusOrNull(
+      final Path path)
+      throws IOException {
+    try {
+      return getFileStatus(path);
+    } catch (FileNotFoundException e) {
+      return null;
+    }
+  }
+
+  /**
+   * Get the status of a path.
+   * @param path path
+   * @return the status
+   * @throws FileNotFoundException if the path does not exist.
+   * @throws IOException IO Failure.
+   */
+  protected final FileStatus getFileStatus(
+      final Path path)
+      throws IOException {
+    LOG.trace("{}: getFileStatus('{}')", getName(), path);
+    requireNonNull(path,
+        () -> String.format("%s: Null path for getFileStatus() call", getName()));
+    return trackDuration(getIOStatistics(), OP_GET_FILE_STATUS, () ->
+        operations.getFileStatus(path));
+  }
+
+  /**
+   * Probe whether a path exists and resolves to a file.
+   * @param path path
+   * @return true if the path resolves to a file
+   * @throws IOException IO Failure.
+   */
+  protected final boolean isFile(
+      final Path path)
+      throws IOException {
+    LOG.trace("{}: isFile('{}')", getName(), path);
+    return trackDuration(getIOStatistics(), OP_IS_FILE, () -> {
+      return operations.isFile(path);
+    });
+  }
+
+  /**
+   * Delete a path.
+   * @param path path
+   * @param recursive recursive delete.
+   * @return the outcome of the delete call.
+   * @throws IOException IO Failure.
+   */
+  protected final boolean delete(
+      final Path path,
+      final boolean recursive)
+      throws IOException {
+    LOG.trace("{}: delete('{}, {}')", getName(), path, recursive);
+    return delete(path, recursive, OP_DELETE);
+  }
+
+  /**
+   * Delete a path.
+   * @param path path
+   * @param recursive recursive delete.
+   * @param statistic statistic to update
+   * @return the outcome of the delete call.
+   * @throws IOException IO Failure.
+   */
+  protected Boolean delete(
+      final Path path,
+      final boolean recursive,
+      final String statistic)
+      throws IOException {
+    return trackDuration(getIOStatistics(), statistic, () -> {
+      return operations.delete(path, recursive);
+    });
+  }
+
+  /**
+   * Create a directory.
+   * @param path path
+   * @param escalateFailure escalate "false" to PathIOE
+   * @return true if the directory was created/exists.
+   * @throws IOException IO Failure.
+   */
+  protected final boolean mkdirs(
+      final Path path,
+      final boolean escalateFailure)
+      throws IOException {
+    LOG.trace("{}: mkdirs('{}')", getName(), path);
+    return trackDuration(getIOStatistics(), OP_MKDIRS, () -> {
+      boolean success = operations.mkdirs(path);
+      if (!success && escalateFailure) {
+        throw new PathIOException(path.toUri().toString(),
+            stageStatisticName + ": mkdirs() returned false");
+      }
+      return success;
+    });
+
+  }
+
+  /**
+   * List all files directly under a path.
+   * Async implementations may under-report their durations.
+   * @param path path
+   * @return iterator over the results.
+   * @throws IOException IO Failure.
+   */
+  protected final RemoteIterator<FileStatus> listStatusIterator(
+      final Path path)
+      throws IOException {
+    LOG.trace("{}: listStatusIterator('{}')", getName(), path);
+    return trackDuration(getIOStatistics(), OP_LIST_STATUS, () ->
+        operations.listStatusIterator(path));
+  }
+
+  /**
+   * Load a manifest file.
+   * @param status source.
+   * @return the manifest.
+   * @throws IOException IO Failure.
+   */
+  protected final TaskManifest loadManifest(
+      final FileStatus status)
+      throws IOException {
+    LOG.trace("{}: loadManifest('{}')", getName(), status);
+    return trackDuration(getIOStatistics(), OP_LOAD_MANIFEST, () ->
+        operations.loadTaskManifest(
+            stageConfig.currentManifestSerializer(),
+            status));
+  }
+
+  /**
+   * List all the manifests in the task manifest dir.
+   * @return an iterator of manifests.
+   * @throws IOException IO Failure.
+   */
+  protected final RemoteIterator<FileStatus> listManifests()
+      throws IOException {
+    return RemoteIterators.filteringRemoteIterator(
+        listStatusIterator(getTaskManifestDir()),
+        st -> st.getPath().toUri().toString().endsWith(MANIFEST_SUFFIX));
+  }
+
+  /**
+   * Make an msync() call; swallow when unsupported.
+   * @param path path
+   * @throws IOException IO failure
+   */
+  protected final void msync(Path path) throws IOException {
+    LOG.trace("{}: msync('{}')", getName(), path);
+    trackDurationOfInvocation(getIOStatistics(), OP_MSYNC, () ->
+        operations.msync(path));
+  }
+
+  /**
+   * Create a directory, failing if it already exists or if
+   * mkdirs() returned false.
+   * @param operation operation for error reporting.
+   * @param path path to create.
+   * @return the path.
+   * @throws IOException failure
+   * @throws PathIOException mkdirs failed.
+   * @throws FileAlreadyExistsException destination exists.
+   */
+  protected final Path createNewDirectory(
+      final String operation,
+      final Path path) throws IOException {
+    LOG.trace("{}: {} createNewDirectory('{}')", getName(), operation, path);
+    requireNonNull(path,
+        () -> String.format("%s: Null path for operation %s", getName(), operation));
+    // check for dir existence before trying to create.
+    try {
+      final FileStatus status = getFileStatus(path);
+      // no exception, so the path exists.
+      throw new FileAlreadyExistsException(operation
+          + ": path " + path
+          + " already exists and has status " + status);
+    } catch (FileNotFoundException e) {
+      // the path does not exist, so create it.
+      mkdirs(path, true);
+      return path;
+    }
+  }
+
+  /**
+   * Assert that a path is a directory which must exist.
+   * @param operation operation for error reporting.
+   * @param path path which must exist.
+   * @return the path
+   * @throws IOException failure
+   * @throws FileNotFoundException if the path does not exist.
+   * @throws PathIOException if the path is not a directory.
+   */
+  protected final Path directoryMustExist(
+      final String operation,
+      final Path path) throws IOException {
+    final FileStatus status = getFileStatus(path);
+    if (!status.isDirectory()) {
+      throw new PathIOException(path.toString(),
+          operation
+              + ": Path is not a directory; its status is :" + status);
+    }
+    return path;
+  }
+
+  /**
+   * Save a task manifest or summary. This will be done by
+   * writing to a temp path and then renaming.
+   * If the destination path exists: Delete it.
+   * @param manifestData the manifest/success file
+   * @param tempPath temp path for the initial save
+   * @param finalPath final path for rename.
+   * @throws IOException failure to save or rename the file
+   */
+  @SuppressWarnings("unchecked")
+  protected final <T extends AbstractManifestData> void save(T manifestData,
+      final Path tempPath,
+      final Path finalPath) throws IOException {
+    LOG.trace("{}: save('{}, {}, {}')", getName(), manifestData, tempPath, finalPath);
+    trackDurationOfInvocation(getIOStatistics(), OP_SAVE_TASK_MANIFEST, () ->
+        operations.save(manifestData, tempPath, true));
+    renameFile(tempPath, finalPath);
+  }
+
+  /**
+   * Get an etag from a FileStatus, if the store operations
+   * implementation can extract one from it.
+   * @param status the status; may be null.
+   * @return the etag or null if not provided
+   */
+  public String getEtag(FileStatus status) {
+    return operations.getEtag(status);
+  }
+
+  /**
+   * Rename a file from source to dest; if the underlying FS API call
+   * returned false that's escalated to an IOE.
+   * @param source source file.
+   * @param dest dest file
+   * @throws IOException failure
+   * @throws PathIOException if the rename() call returned false.
+   */
+  protected final void renameFile(final Path source, final Path dest)
+      throws IOException {
+    maybeDeleteDest(true, dest);
+    executeRenamingOperation("renameFile", source, dest,
+        OP_RENAME_FILE, () ->
+            operations.renameFile(source, dest));
+  }
+
+  /**
+   * Rename a directory from source to dest; if the underlying FS API call
+   * returned false that's escalated to an IOE.
+   * @param source source directory.
+   * @param dest dest directory
+   * @throws IOException failure
+   * @throws PathIOException if the rename() call returned false.
+   */
+  protected final void renameDir(final Path source, final Path dest)
+      throws IOException {
+
+    maybeDeleteDest(true, dest);
+    executeRenamingOperation("renameDir", source, dest,
+        OP_RENAME_FILE, () ->
+        operations.renameDir(source, dest)
+    );
+  }
+
+  /**
+   * Commit a file from the manifest using rename or, if available, resilient renaming.
+   * @param entry entry from manifest
+   * @param deleteDest should the destination be deleted before the rename?
+   * @throws PathIOException if the rename() call returned false and was escalated.
+   * @throws IOException failure
+   */
+  protected final CommitOutcome commitFile(FileEntry entry,
+      boolean deleteDest)
+      throws IOException {
+
+    final Path source = entry.getSourcePath();
+    final Path dest = entry.getDestPath();
+
+    maybeDeleteDest(deleteDest, dest);
+    if (storeSupportsResilientCommit()) {
+      // use the store's resilient commit operation, tracking its duration
+      final ManifestStoreOperations.CommitFileResult result = trackDuration(getIOStatistics(),
+          OP_COMMIT_FILE_RENAME, () ->
+              operations.commitFile(entry));
+      if (result.recovered()) {
+        // recovery took place.
+        getIOStatistics().incrementCounter(OP_COMMIT_FILE_RENAME_RECOVERED);
+      }
+      if (result.getWaitTime() != null) {
+        // note any delay which took place
+        noteAnyRateLimiting(STORE_IO_RATE_LIMITED, result.getWaitTime());
+      }
+    } else {
+      // commit with a simple rename; failures will be escalated.
+      executeRenamingOperation("renameFile", source, dest,
+          OP_COMMIT_FILE_RENAME, () ->
+              operations.renameFile(source, dest));
+    }
+    return new CommitOutcome();
+  }
+
+  /**
+   * Does this store support resilient commit.
+   * @return true if resilient commit operations are available.
+   */
+  protected boolean storeSupportsResilientCommit() {
+    return operations.storeSupportsResilientCommit();
+  }
+
+  private void maybeDeleteDest(final boolean deleteDest, final Path dest) throws IOException {
+    if (deleteDest) {
+      // delete the destination, always, knowing that it's a no-op if
+      // the data isn't there. Skipping the check saves one round trip
+      // to actually look for the file/object
+      boolean deleted = delete(dest, true);
+      // log the outcome in case of emergency diagnostics traces
+      // being needed.
+      LOG.debug("{}: delete('{}') returned {}'", getName(), dest, deleted);
+    }
+  }
+
+  /**
+   * Execute an operation to rename a file/dir, commit a manifest entry.
+   * The statistic is tracked; returning false from the operation is considered
+   * a failure from the statistics perspective.
+   * @param operation operation name
+   * @param source source path
+   * @param dest dest path
+   * @param statistic statistic to track
+   * @param action callable of the operation
+   * @throws IOException on any failure
+   */
+  private void executeRenamingOperation(String operation,
+      Path source,
+      Path dest,
+      String statistic,
+      CallableRaisingIOE<Boolean> action) throws IOException {
+
+    LOG.debug("{}: {} '{}' to '{}')", getName(), operation, source, dest);
+    requireNonNull(source, "Null source");
+    requireNonNull(dest, "Null dest");
+
+    // duration tracking is a bit convoluted as it
+    // ensures that rename failures as well as IOEs are
+    // treated as failures from a statistics perspective.
+
+    DurationTracker tracker = createTracker(getIOStatistics(), statistic);
+    boolean success;
+    try {
+      success = action.apply();
+      if (!success) {
+        // record failure in the tracker before closing it
+        tracker.failed();
+      }
+    } catch (IOException | RuntimeException e) {
+      LOG.info("{}: {} raised an exception: {}", getName(), operation, e.toString());
+      LOG.debug("{}: {} stack trace", getName(), operation, e);
+      tracker.failed();
+      throw e;
+    } finally {
+      // update the tracker; if failed() was already called,
+      // closing it records the failure.
+      tracker.close();
+    }
+    // escalate the failure; this is done out of the duration tracker
+    // so its file status probes aren't included.
+    if (!success) {
+      throw escalateRenameFailure(operation, source, dest);
+    }
+  }
+
+  /**
+   * Escalate a rename failure to an exception.
+   * Returns an error exception to throw if one was not
+   * triggered when probing for the source.
+   * @param operation operation name
+   * @param source source path
+   * @param dest dest path
+   * @return an exception to throw
+   * @throws IOException raised probing for source or dest
+   */
+  private PathIOException escalateRenameFailure(String operation,
+      Path source, Path dest) throws IOException {
+    // rename just returned false.
+    // collect information for a meaningful error message
+    // and include in an exception raised.
+
+    // get the source status; this will implicitly raise a FNFE.
+    final FileStatus sourceStatus = getFileStatus(source);
+
+    // and look to see if there is anything at the destination
+    final FileStatus destStatus = getFileStatusOrNull(dest);
+
+    LOG.error("{}: failure to {} {} to {} with" +
+            " source status {} " +
+            " and destination status {}",
+        getName(), operation, source, dest,
+        sourceStatus, destStatus);
+
+    return new PathIOException(source.toString(),
+        FAILED_TO_RENAME_PREFIX + operation + " to " + dest);
+  }
+
+  /**
+   * Outcome from the commit.
+   */
+  public static final class CommitOutcome {
+
+  }
+
+  /**
+   * Job ID: never null.
+   */
+  protected final String getJobId() {
+    return stageConfig.getJobId();
+  }
+
+  /**
+   * Job attempt number.
+   */
+  protected final int getJobAttemptNumber() {
+    return stageConfig.getJobAttemptNumber();
+  }
+
+  /**
+   * ID of the task.
+   */
+  protected final String getTaskId() {
+    return stageConfig.getTaskId();
+  }
+
+  /**
+   * Get the task ID; raise an NPE
+   * if it is null.
+   * @return a non-null task ID.
+   */
+  protected final String getRequiredTaskId() {
+    return requireNonNull(getTaskId(),
+        "No Task ID in stage config");
+  }
+
+  /**
+   * ID of this specific attempt at a task.
+   */
+  protected final String getTaskAttemptId() {
+    return stageConfig.getTaskAttemptId();
+  }
+
+  /**
+   * Get the task attempt ID; raise an NPE
+   * if it is null.
+   * @return a non-null task attempt ID.
+   */
+  protected final String getRequiredTaskAttemptId() {
+    return requireNonNull(getTaskAttemptId(),
+        "No Task Attempt ID in stage config");
+  }
+
+  /**
+   * Job attempt dir.
+   */
+  protected final Path getJobAttemptDir() {
+    return stageConfig.getJobAttemptDir();
+  }
+
+  /**
+   * Directory to put task manifests into.
+   * @return a path under the job attempt dir.
+   */
+  protected final Path getTaskManifestDir() {
+    return stageConfig.getTaskManifestDir();
+  }
+
+
+  /**
+   * Task attempt dir.
+   */
+  protected final Path getTaskAttemptDir() {
+    return stageConfig.getTaskAttemptDir();
+  }
+
+  /**
+   * Get the task attemptDir; raise an NPE
+   * if it is null.
+   * @return a non-null task attempt dir.
+   */
+  protected final Path getRequiredTaskAttemptDir() {
+    return requireNonNull(getTaskAttemptDir(),
+        "No Task Attempt Dir");
+  }
+
+  /**
+   * Destination of job.
+   */
+  protected final Path getDestinationDir() {
+    return stageConfig.getDestinationDir();
+  }
+
+  /**
+   * Stage name, for logging.
+   * @return name.
+   */
+  public final String getName() {
+    return name;
+  }
+
+  /**
+   * Submitter for doing IO against the store other than
+   * manifest processing.
+   */
+  protected final TaskPool.Submitter getIOProcessors() {
+    return ioProcessors;
+  }
+
+  /**
+   * Submitter for doing IO against the store other than
+   * manifest processing.
+   * The size parameter is used to select between sequential
+   * and parallel runners.
+   * No data, or one entry: serial.
+   * Everything else: parallel.
+   * @param size number of items.
+   * @return a submitter or null
+   */
+  protected final TaskPool.Submitter getIOProcessors(int size) {
+    return size > 1
+        ? getIOProcessors()
+        : null;
+  }
+
+  /**
+   * Delete a directory, possibly suppressing exceptions.
+   * @param dir directory.
+   * @param suppressExceptions should exceptions be suppressed?
+   * @throws IOException exceptions raised in delete if not suppressed.
+   * @return any exception caught and suppressed
+   */
+  protected IOException deleteDir(
+      final Path dir,
+      final Boolean suppressExceptions)
+      throws IOException {
+    try {
+      delete(dir, true);
+      return null;
+    } catch (IOException ex) {
+      LOG.info("Error deleting {}: {}", dir, ex.toString());
+      if (!suppressExceptions) {
+        throw ex;
+      } else {
+        return ex;
+      }
+    }
+  }
+
+  /**
+   * Create an entry for a file to rename under the destination.
+   * If the store operations support extracting etags from file status
+   * entries, the etag is included in the entry.
+   * @param status source file
+   * @param destDir destination directory
+   * @return an entry which includes the rename path
+   */
+  protected FileEntry fileEntry(FileStatus status, Path destDir) {
+    // generate a new path under the dest dir
+    Path dest = new Path(destDir, status.getPath().getName());
+    return new FileEntry(status.getPath(),
+        dest,
+        status.getLen(),
+        getEtag(status));
+  }
+
+}

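To illustrate the execute-once stage protocol described in the class javadoc above, here is a hypothetical subclass; the statistic name and the probe logic are invented for the example, and a real stage would use one of the ManifestCommitterStatisticNames constants.

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.AbstractJobOrTaskStage;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

    /** Hypothetical job-level stage: probe whether a path exists. */
    final class ProbePathStage extends AbstractJobOrTaskStage<Path, Boolean> {

      ProbePathStage(StageConfig stageConfig) {
        // job stage (not a task stage), illustrative statistic name,
        // no parallel IO processors required.
        super(false, stageConfig, "op_stage_job_probe_path", false);
      }

      @Override
      protected Boolean executeStage(Path path) throws IOException {
        // apply() wraps this call with duration tracking, logging and
        // the "execute once only" check.
        return getFileStatusOrNull(path) != null;
      }
    }

    // usage: each instance may be applied exactly once.
    // boolean exists = new ProbePathStage(stageConfig).apply(path);
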
+ 511 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java

@@ -0,0 +1,511 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.util.DurationInfo;
+import org.apache.hadoop.util.functional.RemoteIterators;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED_DEFAULT;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED_DEFAULT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE_DIRS_DEFAULT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;
+
+/**
+ * Clean up a job's temporary directory: optionally delete task attempt
+ * directories in parallel, then delete the base _temporary directory.
+ * Returns: the outcome of the overall operation
+ * The result is detailed purely for the benefit of tests, which need
+ * to make assertions about error handling and fallbacks.
+ *
+ * There are a few known issues with the Azure and GCS stores which
+ * this stage tries to address.
+ * - Google GCS directory deletion is O(entries), so is slower for big jobs.
+ * - Azure storage directory delete, when using OAuth authentication or
+ *   when not the store owner, triggers a scan down the tree to verify the
+ *   caller has the permission to delete each subdir.
+ *   If this scan takes over 90s, the operation can time out.
+ *
+ * The main solution for both of these is that task attempts are
+ * deleted in parallel, in different threads.
+ * This will speed up GCS cleanup and reduce the risk of
+ * abfs related timeouts.
+ * Exceptions during cleanup can be suppressed,
+ * so that these do not cause the job to fail.
+ *
+ * Also, some users want to be able to run multiple independent jobs
+ * targeting the same output directory simultaneously.
+ * If one job deletes the directory `_temporary`, all the others
+ * will fail.
+ *
+ * This can be addressed by disabling cleanup entirely.
+ *
+ */
+public class CleanupJobStage extends
+    AbstractJobOrTaskStage<
+            CleanupJobStage.Arguments,
+            CleanupJobStage.Result> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      CleanupJobStage.class);
+
+  /**
+   * Count of deleted directories.
+   */
+  private final AtomicInteger deleteDirCount = new AtomicInteger();
+
+  /**
+   * Count of delete failures.
+   */
+  private final AtomicInteger deleteFailureCount = new AtomicInteger();
+
+  /**
+   * Last delete exception; non null if deleteFailureCount is not zero.
+   */
+  private IOException lastDeleteException;
+
+  /**
+   * Stage name as passed in from arguments.
+   */
+  private String stageName = OP_STAGE_JOB_CLEANUP;
+
+  public CleanupJobStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_CLEANUP, true);
+  }
+
+  /**
+   * Statistic name is extracted from the arguments.
+   * @param arguments args to the invocation.
+   * @return stage name.
+   */
+  @Override
+  protected String getStageStatisticName(Arguments arguments) {
+    return arguments.statisticName;
+  }
+
+  /**
+   * Clean up the job attempt directory tree.
+   * @param args arguments built up.
+   * @return the result.
+   * @throws IOException a failure was raised and exceptions weren't suppressed.
+   */
+  @Override
+  protected Result executeStage(
+      final Arguments args)
+      throws IOException {
+    stageName = getStageName(args);
+    // this is $dest/_temporary
+    final Path baseDir = requireNonNull(getStageConfig().getOutputTempSubDir());
+    LOG.debug("{}: Cleaup of directory {} with {}", getName(), baseDir, args);
+    if (!args.enabled) {
+      LOG.info("{}: Cleanup of {} disabled", getName(), baseDir);
+      return new Result(Outcome.DISABLED, baseDir,
+          0, null);
+    }
+    // shortcut of a single existence check before anything else
+    if (getFileStatusOrNull(baseDir) == null) {
+      return new Result(Outcome.NOTHING_TO_CLEAN_UP,
+          baseDir,
+          0, null);
+    }
+
+    Outcome outcome = null;
+    IOException exception;
+
+
+    // the base directory to delete.
+    LOG.info("{}: Deleting job directory {}", getName(), baseDir);
+
+    if (args.deleteTaskAttemptDirsInParallel) {
+      // Attempt to do a parallel delete of task attempt dirs;
+      // don't overreact if a delete fails, but stop trying
+      // to delete the others, and fall back to deleting the
+      // job dir.
+      Path taskSubDir
+          = getStageConfig().getJobAttemptTaskSubDir();
+      try (DurationInfo info = new DurationInfo(LOG,
+          "parallel deletion of task attempts in %s",
+          taskSubDir)) {
+        RemoteIterator<FileStatus> dirs =
+            RemoteIterators.filteringRemoteIterator(
+                listStatusIterator(taskSubDir),
+                FileStatus::isDirectory);
+        TaskPool.foreach(dirs)
+            .executeWith(getIOProcessors())
+            .stopOnFailure()
+            .suppressExceptions(false)
+            .run(this::rmTaskAttemptDir);
+        getIOStatistics().aggregate((retrieveIOStatistics(dirs)));
+
+        if (getLastDeleteException() != null) {
+          // one of the task attempts failed.
+          throw getLastDeleteException();
+        }
+        // success: record this as the outcome.
+        outcome = Outcome.PARALLEL_DELETE;
+      } catch (FileNotFoundException ex) {
+        // not a problem if there's no dir to list.
+        LOG.debug("{}: Task attempt dir {} not found", getName(), taskSubDir);
+        outcome = Outcome.DELETED;
+      } catch (IOException ex) {
+        // failure. Log and continue
+        LOG.info(
+            "{}: Exception while listing/deleting task attempts under {}; continuing",
+            getName(),
+            taskSubDir, ex);
+        // not overreacting here as the base delete will still get executed
+        outcome = Outcome.DELETED;
+      }
+    }
+    // Now the top-level deletion; exception gets saved
+    exception = deleteOneDir(baseDir);
+    if (exception != null) {
+      // failure, report and continue
+      // assume failure.
+      outcome = Outcome.FAILURE;
+    } else {
+      // if the outcome isn't already recorded as parallel delete,
+      // mark it as a simple delete.
+      if (outcome == null) {
+        outcome = Outcome.DELETED;
+      }
+    }
+
+    Result result = new Result(
+        outcome,
+        baseDir,
+        deleteDirCount.get(),
+        exception);
+    if (!result.succeeded() && !args.suppressExceptions) {
+      result.maybeRethrowException();
+    }
+
+    return result;
+  }
+
+  /**
+   * Delete a single TA dir in a parallel task.
+   * Updates the audit context.
+   * Exceptions are swallowed so that attempts are still made
+   * to delete the others, but the first exception
+   * caught is saved in a field which can be retrieved
+   * via {@link #getLastDeleteException()}.
+   *
+   * @param status dir to be deleted.
+   * @throws IOException delete failure.
+   */
+  private void rmTaskAttemptDir(FileStatus status) throws IOException {
+    // stage name in audit context is the one set in the arguments.
+    updateAuditContext(stageName);
+    // update the progress callback in case delete is really slow.
+    progress();
+    deleteOneDir(status.getPath());
+  }
+
+  /**
+   * Delete a directory.
+   * The {@link #deleteFailureCount} counter
+   * is incremented on every failure.
+   * @param dir directory
+   * @throws IOException if an IOE was raised
+   * @return any IOE raised.
+   */
+  private IOException deleteOneDir(final Path dir)
+      throws IOException {
+
+    deleteDirCount.incrementAndGet();
+    IOException ex = deleteDir(dir, true);
+    if (ex != null) {
+      deleteFailure(ex);
+    }
+    return ex;
+  }
+
+  /**
+   * Note a failure.
+   * @param ex exception
+   */
+  private synchronized void deleteFailure(IOException ex) {
+    // exception: increment the failure count
+    deleteFailureCount.incrementAndGet();
+    lastDeleteException = ex;
+  }
+
+  /**
+   * Get the last delete exception; synchronized.
+   * @return the last delete exception or null.
+   */
+  public synchronized IOException getLastDeleteException() {
+    return lastDeleteException;
+  }
+
+  /**
+   * Options to pass down to the cleanup stage.
+   */
+  public static final class Arguments {
+
+    /**
+     * Statistic to update.
+     */
+    private final String statisticName;
+
+    /** Delete is enabled? */
+    private final boolean enabled;
+
+    /** Attempt parallel delete of task attempt dirs? */
+    private final boolean deleteTaskAttemptDirsInParallel;
+
+    /** Ignore failures? */
+    private final boolean suppressExceptions;
+
+    /**
+     * Arguments to the stage.
+     * @param statisticName stage name to report
+     * @param enabled is the stage enabled?
+     * @param deleteTaskAttemptDirsInParallel delete task attempt dirs in
+     *        parallel?
+     * @param suppressExceptions suppress exceptions?
+     */
+    public Arguments(
+        final String statisticName,
+        final boolean enabled,
+        final boolean deleteTaskAttemptDirsInParallel,
+        final boolean suppressExceptions) {
+      this.statisticName = statisticName;
+      this.enabled = enabled;
+      this.deleteTaskAttemptDirsInParallel = deleteTaskAttemptDirsInParallel;
+      this.suppressExceptions = suppressExceptions;
+    }
+
+    public String getStatisticName() {
+      return statisticName;
+    }
+
+    public boolean isEnabled() {
+      return enabled;
+    }
+
+    public boolean isDeleteTaskAttemptDirsInParallel() {
+      return deleteTaskAttemptDirsInParallel;
+    }
+
+    public boolean isSuppressExceptions() {
+      return suppressExceptions;
+    }
+
+    @Override
+    public String toString() {
+      return "Arguments{" +
+          "statisticName='" + statisticName + '\''
+          + ", enabled=" + enabled
+          + ", deleteTaskAttemptDirsInParallel="
+          + deleteTaskAttemptDirsInParallel
+          + ", suppressExceptions=" + suppressExceptions
+          + '}';
+    }
+  }
+
+  /**
+   * Static disabled arguments.
+   */
+  public static final Arguments DISABLED = new Arguments(OP_STAGE_JOB_CLEANUP,
+      false,
+      false,
+      false
+  );
+
+  /**
+   * Build an options argument from a configuration, using the
+   * settings from FileOutputCommitter and manifest committer.
+   * @param statisticName statistic name to use in duration tracking.
+   * @param conf configuration to use.
+   * @return the options to process
+   */
+  public static Arguments cleanupStageOptionsFromConfig(
+      String statisticName, Configuration conf) {
+
+    boolean enabled = !conf.getBoolean(FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED,
+        FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED_DEFAULT);
+    boolean suppressExceptions = conf.getBoolean(
+        FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED,
+        FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED_DEFAULT);
+    boolean deleteTaskAttemptDirsInParallel = conf.getBoolean(
+        OPT_CLEANUP_PARALLEL_DELETE,
+        OPT_CLEANUP_PARALLEL_DELETE_DIRS_DEFAULT);
+    return new Arguments(
+        statisticName,
+        enabled,
+        deleteTaskAttemptDirsInParallel,
+        suppressExceptions
+    );
+  }
+
+  /**
+   * Enum of outcomes.
+   */
+  public enum Outcome {
+    DISABLED("Disabled", false),
+    NOTHING_TO_CLEAN_UP("Nothing to clean up", true),
+    PARALLEL_DELETE("Parallel Delete of Task Attempt Directories", true),
+    DELETED("Delete of job directory", true),
+    FAILURE("Delete failed", false);
+
+    private final String description;
+
+    private final boolean success;
+
+    Outcome(String description, boolean success) {
+      this.description = description;
+      this.success = success;
+    }
+
+    @Override
+    public String toString() {
+      return "Outcome{" + name() +
+          " '" + description + '\'' +
+          "}";
+    }
+
+    /**
+     * description.
+     * @return text for logging
+     */
+    public String getDescription() {
+      return description;
+    }
+
+    /**
+     * Was this a success?
+     * @return true if this outcome is good.
+     */
+    public boolean isSuccess() {
+      return success;
+    }
+  }
+
+  /**
+   * Result of the cleanup.
+   * If the outcome == FAILURE but exceptions were suppressed
+   * (which they are implicitly if an instance of this object
+   * is created and returned), then the exception
+   * MUST NOT be null.
+   */
+  public static final class Result {
+
+    /** Outcome. */
+    private final Outcome outcome;
+
+    /** Directory cleaned up. */
+    private final Path directory;
+
+    /**
+     * Number of delete calls made across all threads.
+     */
+    private final int deleteCalls;
+
+    /**
+     * Any IOE raised.
+     */
+    private final IOException exception;
+
+    public Result(
+        final Outcome outcome,
+        final Path directory,
+        final int deleteCalls,
+        IOException exception) {
+      this.outcome = requireNonNull(outcome, "outcome");
+      this.directory = directory;
+      this.deleteCalls = deleteCalls;
+      this.exception = exception;
+      if (outcome == Outcome.FAILURE) {
+        requireNonNull(exception, "No exception in failure result");
+      }
+    }
+
+    public Path getDirectory() {
+      return directory;
+    }
+
+    public boolean wasExecuted() {
+      return outcome != Outcome.DISABLED;
+    }
+
+    /**
+     * Was the outcome a success?
+     * That is: either the dir wasn't there or through
+     * delete/rename it is no longer there.
+     * @return true if the temporary dir no longer exists.
+     */
+    public boolean succeeded() {
+      return outcome.isSuccess();
+    }
+
+    public Outcome getOutcome() {
+      return outcome;
+    }
+
+    public int getDeleteCalls() {
+      return deleteCalls;
+    }
+
+    public IOException getException() {
+      return exception;
+    }
+
+    /**
+     * If there was an IOE caught, throw it.
+     * For ease of use in (meaningful) lambda expressions
+     * in tests, returns the string value if there
+     * was no exception to throw.
+     * @throws IOException exception.
+     */
+    public String maybeRethrowException() throws IOException {
+      if (exception != null) {
+        throw exception;
+      }
+      return toString();
+    }
+
+    @Override
+    public String toString() {
+      return "CleanupResult{" +
+          "outcome=" + outcome +
+          ", directory=" + directory +
+          ", deleteCalls=" + deleteCalls +
+          ", exception=" + exception +
+          '}';
+    }
+  }
+}

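A sketch of driving this stage from configuration, using the cleanupStageOptionsFromConfig() helper shown above; stageConfig is assumed to be the job's populated StageConfig.

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
    import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

    import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;

    final class CleanupExample {
      /** Run cleanup with options derived from the job configuration. */
      static CleanupJobStage.Result cleanup(StageConfig stageConfig, Configuration conf)
          throws IOException {
        CleanupJobStage.Arguments args =
            CleanupJobStage.cleanupStageOptionsFromConfig(OP_STAGE_JOB_CLEANUP, conf);
        return new CleanupJobStage(stageConfig).apply(args);
      }
    }
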
+ 245 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitJobStage.java

@@ -0,0 +1,245 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+import java.util.List;
+
+import javax.annotation.Nullable;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.commons.io.FileUtils.byteCountToDisplaySize;
+import static org.apache.commons.lang3.StringUtils.isNotBlank;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_BYTES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_FILES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.MANIFESTS;
+
+/**
+ * Commit the Job.
+ * Input: {@link Arguments} (create the _SUCCESS marker, validate output,
+ * manifest rename dir, cleanup options).
+ * Output: {@link Result} (the success data and the path it was saved to).
+ */
+public class CommitJobStage extends
+    AbstractJobOrTaskStage<
+            CommitJobStage.Arguments,
+            CommitJobStage.Result> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      CommitJobStage.class);
+
+  public CommitJobStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_COMMIT, true);
+  }
+
+  @Override
+  protected CommitJobStage.Result executeStage(
+      final CommitJobStage.Arguments arguments) throws IOException {
+
+    LOG.info("{}: Committing job \"{}\". resilient commit supported = {}",
+        getName(),
+        getJobId(),
+        storeSupportsResilientCommit());
+
+    boolean createMarker = arguments.isCreateMarker();
+
+    // load the manifests
+    final StageConfig stageConfig = getStageConfig();
+    LoadManifestsStage.Result result
+        = new LoadManifestsStage(stageConfig).apply(true);
+    List<TaskManifest> manifests = result.getManifests();
+    LoadManifestsStage.SummaryInfo summary = result.getSummary();
+
+    LOG.debug("{}: Job Summary {}", getName(), summary);
+    LOG.info("{}: Committing job with file count: {}; total size {} bytes",
+        getName(),
+        summary.getFileCount(),
+        byteCountToDisplaySize(summary.getTotalFileSize()));
+
+
+    // add in the manifest statistics to our local IOStatistics for
+    // reporting.
+    IOStatisticsStore iostats = getIOStatistics();
+    iostats.aggregate(summary.getIOStatistics());
+
+    // prepare destination directories.
+    final CreateOutputDirectoriesStage.Result dirStageResults =
+        new CreateOutputDirectoriesStage(stageConfig)
+            .apply(manifests);
+
+    // commit all the tasks.
+    // The success data includes a snapshot of the IO Statistics
+    // and hence all aggregate stats from the tasks.
+    ManifestSuccessData successData;
+    successData = new RenameFilesStage(stageConfig).apply(
+        Pair.of(manifests, dirStageResults.getCreatedDirectories()));
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("{}: _SUCCESS file summary {}", getName(), successData.toJson());
+    }
+    // update the counter of bytes committed and files.
+    // use setCounter so as to ignore any values accumulated when
+    // aggregating tasks.
+    iostats.setCounter(
+        COMMITTER_FILES_COMMITTED_COUNT,
+        summary.getFileCount());
+    iostats.setCounter(
+        COMMITTER_BYTES_COMMITTED_COUNT,
+        summary.getTotalFileSize());
+    successData.snapshotIOStatistics(iostats);
+
+
+    // rename manifests. Only warn on failure here.
+    final String manifestRenameDir = arguments.getManifestRenameDir();
+    if (isNotBlank(manifestRenameDir)) {
+      Path manifestRenamePath = new Path(
+          new Path(manifestRenameDir),
+          getJobId());
+      LOG.info("{}: Renaming manifests to {}", getName(), manifestRenamePath);
+      try {
+        renameDir(getTaskManifestDir(), manifestRenamePath);
+
+        // save this path in the summary diagnostics
+        successData.getDiagnostics().put(MANIFESTS, manifestRenamePath.toUri().toString());
+      } catch (IOException | IllegalArgumentException e) {
+        // rename failure, including path for wrong filesystem
+        LOG.warn("{}: Failed to rename manifests to {}", getName(), manifestRenamePath, e);
+      }
+    }
+
+    // save the _SUCCESS if the option is enabled.
+    Path successPath = null;
+    if (createMarker) {
+      // save a snapshot of the IO Statistics
+
+      successPath = new SaveSuccessFileStage(stageConfig)
+          .apply(successData);
+      LOG.debug("{}: Saving _SUCCESS file to {}", getName(), successPath);
+    }
+
+    // optional cleanup
+    new CleanupJobStage(stageConfig).apply(arguments.getCleanupArguments());
+
+    // and then, after everything else: optionally validate.
+    if (arguments.isValidateOutput()) {
+      // cache and restore the active stage field
+      LOG.info("{}: Validating output.", getName());
+      new ValidateRenamedFilesStage(stageConfig)
+          .apply(result.getManifests());
+    }
+
+    // restore the active stage so that when the report is saved
+    // it is declared as job commit, not cleanup or validate.
+    stageConfig.enterStage(getStageName(arguments));
+
+    // the result
+    return new CommitJobStage.Result(successPath, successData);
+  }
+
+  /**
+   * Arguments for job commit.
+   */
+  public static final class Arguments {
+
+    /** create the _SUCCESS marker? */
+    private final boolean createMarker;
+
+    /** perform validation checks on the files? */
+    private final boolean validateOutput;
+
+    /** optional directory to rename the task manifests to. */
+    private final String manifestRenameDir;
+
+    /** cleanup arguments. */
+    private final CleanupJobStage.Arguments cleanupArguments;
+
+    /**
+     * Constructor.
+     * @param createMarker create the _SUCCESS marker?
+     * @param validateOutput perform validation checks on the files?
+     * @param manifestRenameDir optional directory to rename the task manifests to
+     * @param cleanupArguments cleanup arguments.
+     */
+    public Arguments(
+        boolean createMarker,
+        boolean validateOutput,
+        @Nullable String manifestRenameDir,
+        CleanupJobStage.Arguments cleanupArguments) {
+
+      this.createMarker = createMarker;
+      this.validateOutput = validateOutput;
+      this.manifestRenameDir = manifestRenameDir;
+      this.cleanupArguments = requireNonNull(cleanupArguments);
+    }
+
+    public boolean isCreateMarker() {
+      return createMarker;
+    }
+
+    public boolean isValidateOutput() {
+      return validateOutput;
+    }
+
+    public String getManifestRenameDir() {
+      return manifestRenameDir;
+    }
+
+    public CleanupJobStage.Arguments getCleanupArguments() {
+      return cleanupArguments;
+    }
+  }
+
+  /**
+   * Result of the stage.
+   */
+  public static final class Result {
+    /**
+     * Manifest success data.
+     */
+    private final ManifestSuccessData jobSuccessData;
+
+    /**
+     * Success file path. null if not saved.
+     */
+    private final Path successPath;
+
+    public Result(final Path successPath,
+        ManifestSuccessData jobSuccessData) {
+      this.successPath = successPath;
+      this.jobSuccessData = jobSuccessData;
+    }
+
+    public ManifestSuccessData getJobSuccessData() {
+      return jobSuccessData;
+    }
+
+    public Path getSuccessPath() {
+      return successPath;
+    }
+  }
+}
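For orientation only, a minimal sketch of driving this stage from a job driver. It assumes CommitJobStage takes a single StageConfig like the other stages in this package; building the StageConfig and the CleanupJobStage.Arguments is left to the caller and the wrapper class is hypothetical.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitJobStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class CommitJobSketch {

  /** Commit the job; returns the _SUCCESS path, or null if no marker was written. */
  static Path commitJob(StageConfig stageConfig,
      CleanupJobStage.Arguments cleanup) throws IOException {
    CommitJobStage.Arguments args = new CommitJobStage.Arguments(
        true,      // create the _SUCCESS marker
        false,     // skip the extra validation scan of renamed files
        null,      // do not preserve the task manifests anywhere
        cleanup);  // cleanup policy chosen by the caller
    CommitJobStage.Result result = new CommitJobStage(stageConfig).apply(args);
    // result.getJobSuccessData() holds the summary written to _SUCCESS
    return result.getSuccessPath();
  }
}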

+ 115 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitTaskStage.java

@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+
+import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_COMMIT;
+
+/**
+ * Commit a task attempt.
+ * Scan the task attempt directories through
+ * {@link TaskAttemptScanDirectoryStage}
+ * and then save the resulting manifest to its final path via
+ * {@link SaveTaskManifestStage}.
+ */
+
+public class CommitTaskStage extends
+    AbstractJobOrTaskStage<Void, CommitTaskStage.Result> {
+  private static final Logger LOG = LoggerFactory.getLogger(
+      CommitTaskStage.class);
+
+  public CommitTaskStage(final StageConfig stageConfig) {
+    super(true, stageConfig, OP_STAGE_TASK_COMMIT, false);
+  }
+
+  /**
+   * Scan the task attempt dir then save the manifest.
+   * A snapshot of the IOStats will be included in the manifest;
+   * this includes the scan time.
+   * @param arguments arguments to the function.
+   * @return the path the manifest was saved to, and the manifest.
+   * @throws IOException IO failure.
+   */
+  @Override
+  protected CommitTaskStage.Result executeStage(final Void arguments)
+      throws IOException {
+    LOG.info("{}: Committing task \"{}\"", getName(), getTaskAttemptId());
+
+    // execute the scan
+    final TaskAttemptScanDirectoryStage scanStage =
+        new TaskAttemptScanDirectoryStage(getStageConfig());
+    TaskManifest manifest = scanStage.apply(arguments);
+
+    // record the scan duration as the task commit duration. This is not
+    // quite the whole commit, as it omits the manifest save, but it
+    // covers the bulk of the work.
+    scanStage.addExecutionDurationToStatistics(getIOStatistics(), OP_STAGE_TASK_COMMIT);
+
+    // save a snapshot of the IO Statistics
+    final IOStatisticsSnapshot manifestStats = snapshotIOStatistics();
+    manifestStats.aggregate(getIOStatistics());
+    manifest.setIOStatistics(manifestStats);
+
+    // Now save with rename
+    Path manifestPath = new SaveTaskManifestStage(getStageConfig())
+        .apply(manifest);
+    return new CommitTaskStage.Result(manifestPath, manifest);
+  }
+
+  /**
+   * Result of the stage.
+   */
+  public static final class Result {
+    /** The path the manifest was saved to. */
+    private final Path path;
+    /** The manifest. */
+    private final TaskManifest taskManifest;
+
+    public Result(Path path,
+        TaskManifest taskManifest) {
+      this.path = path;
+      this.taskManifest = taskManifest;
+    }
+
+    /**
+     * Get the manifest path.
+     * @return The path the manifest was saved to.
+     */
+    public Path getPath() {
+      return path;
+    }
+
+    /**
+     * Get the manifest.
+     * @return The manifest.
+     */
+    public TaskManifest getTaskManifest() {
+      return taskManifest;
+    }
+
+  }
+}
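As a usage sketch against the constructor and Result type above; the StageConfig is assumed to have been built elsewhere with the task attempt and manifest directories set, and the wrapper class is illustrative only.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitTaskStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class CommitTaskSketch {

  /** Scan the task attempt dir and save its manifest; returns the manifest path. */
  static Path commitTask(StageConfig taskStageConfig) throws IOException {
    // the stage takes no argument: everything comes from the StageConfig
    CommitTaskStage.Result result =
        new CommitTaskStage(taskStageConfig).apply(null);
    // result.getTaskManifest() lists the files to rename and the
    // directories to create during job commit.
    return result.getPath();
  }
}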

+ 423 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java

@@ -0,0 +1,423 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.EntryStatus;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_DELETE;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.measureDurationOfInvocation;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_CREATE_DIRECTORIES;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_DELETE_FILE_UNDER_DESTINATION;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_MKDIRS_RETURNED_FALSE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_PREPARE_DIR_ANCESTORS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CREATE_TARGET_DIRS;
+import static org.apache.hadoop.util.OperationDuration.humanTime;
+
+/**
+ * Prepare the destination directory tree as efficiently as possible,
+ * performing the IO operations in the thread pool.
+ *
+ * The classic FileOutputCommitter does a recursive treewalk and
+ * deletes any files found at paths where directories are to be created.
+ *
+ * Each task manifest's directories are combined with those of the other tasks
+ * to build a set of all directories which are needed, without duplicates.
+ *
+ * This stage requires the aggregate set of manifests to contain
+ * all directories to create, including their level in the tree,
+ * and expects them to have been probed for existence/state.
+ *
+ * For each level, all dirs are processed in parallel to
+ * be created or, if files, deleted.
+ *
+ * The stage returns the list of directories created, and for testing,
+ * the map of paths to outcomes.
+ *
+ * Directory creation can be surprisingly slow against object stores,
+ * so use benchmarks from real test runs when tuning this algorithm.
+ */
+public class CreateOutputDirectoriesStage extends
+    AbstractJobOrTaskStage<
+        List<TaskManifest>,
+        CreateOutputDirectoriesStage.Result> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      CreateOutputDirectoriesStage.class);
+
+  /**
+   * Directories as a map of (path, state).
+   * A map rather than a set is used for efficient concurrent access;
+   * the concurrent set implementations have slower lookups.
+   */
+  private final Map<Path, DirMapState> dirMap = new ConcurrentHashMap<>();
+
+  /**
+   * A list of created paths for the results.
+   */
+  private final List<Path> createdDirectories = new ArrayList<>();
+
+  public CreateOutputDirectoriesStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_CREATE_TARGET_DIRS, true);
+    // add the dest dir to the dir map as we expect the job setup to create it.
+    dirMap.put(getDestinationDir(), DirMapState.dirWasCreated);
+  }
+
+  @Override
+  protected Result executeStage(
+      final List<TaskManifest> taskManifests)
+      throws IOException {
+
+    final List<Path> directories = createAllDirectories(taskManifests);
+    LOG.debug("{}: Created {} directories", getName(), directories.size());
+    return new Result(new HashSet<>(directories), dirMap);
+  }
+
+  /**
+   * Build the combined set of directories needed across all tasks,
+   * delete any files in the way, then create the leaf directories.
+   * @param taskManifests task manifests
+   * @return the list of paths which have been created.
+   * @throws IOException IO failure.
+   */
+  private List<Path> createAllDirectories(final List<TaskManifest> taskManifests)
+      throws IOException {
+
+    // all directories which need to exist across all
+    // tasks.
+    // leaf directories
+    final Map<Path, DirEntry> leaves = new HashMap<>();
+    // parent directories. these do not need to be
+    // explicitly created.
+    final Map<Path, DirEntry> parents = new HashMap<>();
+    // the files which must be deleted as a directory
+    // will be created at that path.
+    final Set<Path> filesToDelete = new HashSet<>();
+
+    // iterate through the task manifests
+    // and all output dirs into the set of dirs to
+    // create.
+    // hopefully there is a lot of overlap, so the
+    // final number of dirs to create is small.
+    for (TaskManifest task: taskManifests) {
+      final List<DirEntry> destDirectories = task.getDestDirectories();
+      Collections.sort(destDirectories, (o1, o2) ->
+          o1.getLevel() - o2.getLevel());
+      for (DirEntry entry: destDirectories) {
+        // add the dest entry
+        final Path path = entry.getDestPath();
+        if (!leaves.containsKey(path)) {
+          leaves.put(path, entry);
+
+          // if it is a file to delete, record this.
+          if (entry.getStatus() == EntryStatus.file) {
+            filesToDelete.add(path);
+          }
+          final Path parent = path.getParent();
+          if (parent != null && leaves.containsKey(parent)) {
+            // there's a parent dir, move it from the leaf list
+            // to parent list
+            parents.put(parent,
+                leaves.remove(parent));
+          }
+        }
+      }
+    }
+
+    // at this point there is a map of all directories which are leaf
+    // entries and so need to be created if not already present, plus
+    // the set of files which must be deleted first.
+
+    // Delete the files occupying paths where directories are to be created.
+    deleteFiles(filesToDelete);
+
+    // Now the real work.
+    final int createCount = leaves.size();
+    LOG.info("Preparing {} directory/directories", createCount);
+    // now probe for and create the leaf dirs, which are those at the
+    // bottom level
+    Duration d = measureDurationOfInvocation(getIOStatistics(), OP_CREATE_DIRECTORIES, () ->
+        TaskPool.foreach(leaves.values())
+            .executeWith(getIOProcessors(createCount))
+            .onFailure(this::reportMkDirFailure)
+            .stopOnFailure()
+            .run(this::createOneDirectory));
+    LOG.info("Time to prepare directories {}", humanTime(d.toMillis()));
+    return createdDirectories;
+  }
+
+  /**
+   * How many failures have been reported.
+   */
+  private final AtomicInteger failureCount = new AtomicInteger();
+
+  /**
+   * Report a single directory creation failure.
+   * @param dirEntry entry for the directory which could not be created
+   * @param e exception raised.
+   */
+  private void reportMkDirFailure(DirEntry dirEntry, Exception e) {
+    Path path = dirEntry.getDestPath();
+    final int count = failureCount.incrementAndGet();
+    LOG.warn("{}: mkdir failure #{} Failed to create directory \"{}\": {}",
+        getName(), count, path, e.toString());
+    LOG.debug("{}: Full exception details",
+        getName(), e);
+  }
+
+  /**
+   * Delete all files which exist at paths where directories are to be created.
+   * @param filesToDelete paths of the files to delete.
+   * @throws IOException IO problem
+   */
+  private void deleteFiles(final Set<Path> filesToDelete)
+      throws IOException {
+
+    final int size = filesToDelete.size();
+    if (size == 0) {
+      // nothing to delete.
+      return;
+    }
+    LOG.info("{}: Directory entries containing files to delete: {}", getName(), size);
+    Duration d = measureDurationOfInvocation(getIOStatistics(),
+        OP_PREPARE_DIR_ANCESTORS, () ->
+            TaskPool.foreach(filesToDelete)
+                .executeWith(getIOProcessors(size))
+                .stopOnFailure()
+                .run(dir -> {
+                  updateAuditContext(OP_PREPARE_DIR_ANCESTORS);
+                  deleteDirWithFile(dir);
+                }));
+    LOG.info("Time to delete files {}", humanTime(d.toMillis()));
+  }
+
+  /**
+   * Delete a file at a path where a directory is to be created.
+   * @param dir path of the file to delete
+   * @throws IOException failure other than FileNotFoundException
+   */
+  private void deleteDirWithFile(Path dir) throws IOException {
+    // report progress back
+    progress();
+    LOG.info("{}: Deleting file {}", getName(), dir);
+    delete(dir, false, OP_DELETE);
+    // note its final state
+    addToDirectoryMap(dir, DirMapState.fileNowDeleted);
+  }
+
+
+  /**
+   * Create a directory if required, updating the directory map
+   * and, if the operation took place, the list of created dirs.
+   * Reports progress on invocation.
+   * @param dirEntry entry
+   * @throws IOException failure.
+   */
+  private void createOneDirectory(final DirEntry dirEntry) throws IOException {
+    // report progress back
+    progress();
+    final Path dir = dirEntry.getDestPath();
+    updateAuditContext(OP_STAGE_JOB_CREATE_TARGET_DIRS);
+    final DirMapState state = maybeCreateOneDirectory(dirEntry);
+    switch (state) {
+    case dirFoundInStore:
+      addToDirectoryMap(dir, state);
+      break;
+    case dirWasCreated:
+    case dirCreatedOnSecondAttempt:
+      addCreatedDirectory(dir);
+      addToDirectoryMap(dir, state);
+      break;
+    default:
+      break;
+    }
+
+  }
+
+
+  /**
+   * Try to efficiently and robustly create a directory in a method which is
+   * expected to be executed in parallel with operations creating
+   * peer directories.
+   * @param dirEntry entry for the directory to create
+   * @return the final state of the directory
+   * @throws IOException IO Failure.
+   */
+  private DirMapState maybeCreateOneDirectory(DirEntry dirEntry) throws IOException {
+    final EntryStatus status = dirEntry.getStatus();
+    if (status == EntryStatus.dir) {
+      return DirMapState.dirFoundInStore;
+    }
+    // present in case directories are ever created in task commits
+    if (status == EntryStatus.created_dir) {
+      return DirMapState.dirWasCreated;
+    }
+
+    // here the dir doesn't exist because
+    // it was a file and has been deleted, or
+    // checks failed. create it.
+    final Path path = dirEntry.getDestPath();
+
+    LOG.info("Creating directory {}", path);
+
+    try {
+      if (mkdirs(path, false)) {
+        // success: return immediately.
+        return DirMapState.dirWasCreated;
+      }
+      getIOStatistics().incrementCounter(OP_MKDIRS_RETURNED_FALSE);
+
+      LOG.info("{}: mkdirs({}) returned false, attempting to recover",
+          getName(), path);
+    } catch (IOException e) {
+      // can be caused by file existing, etc.
+      LOG.info("{}: mkdir({}) raised exception {}", getName(), path, e.toString());
+      LOG.debug("{}: Mkdir stack", getName(), e);
+    }
+
+    // fallback to checking the FS, in case a different process did it.
+    FileStatus st = getFileStatusOrNull(path);
+    if (st != null) {
+      if (!st.isDirectory()) {
+        // is bad: delete a file
+        LOG.info("{}: Deleting file where a directory should go: {}",
+            getName(), st);
+        delete(path, false, OP_DELETE_FILE_UNDER_DESTINATION);
+      } else {
+        // is good.
+        LOG.warn("{}: Even though mkdirs({}) failed, there is now a directory there",
+            getName(), path);
+        return DirMapState.dirFoundInStore;
+      }
+    } else {
+      // nothing found. This should never happen.
+      LOG.warn("{}: Although mkdirs({}) returned false, there's nothing at that path to prevent it",
+          getName(), path);
+
+    }
+
+    // try to create the directory again
+    // if this fails again, or an IOException is raised, the
+    // failure propagates to the caller.
+    if (!mkdirs(path, false)) {
+
+      // mkdirs failed again
+      getIOStatistics().incrementCounter(OP_MKDIRS_RETURNED_FALSE);
+
+      // require the dir to exist, raising an exception if it does not.
+      directoryMustExist("Creating directory ", path);
+    }
+
+    // we only get here if the second attempt recovered
+    return DirMapState.dirCreatedOnSecondAttempt;
+
+  }
+
+  /**
+   * Add a created dir to the list of created dirs.
+   * @param dir new dir.
+   */
+  private void addCreatedDirectory(final Path dir) {
+    synchronized (createdDirectories) {
+      createdDirectories.add(dir);
+    }
+  }
+
+  /**
+   * Add a dir to the directory map if there is not already an entry there.
+   * @param dir directory.
+   * @param state state of entry
+   */
+  private void addToDirectoryMap(final Path dir,
+      DirMapState state) {
+    dirMap.putIfAbsent(dir, state);
+  }
+
+
+  /**
+   * Result of the operation.
+   */
+  public static final class Result {
+
+    /** directories created. */
+    private final Set<Path> createdDirectories;
+
+    /**
+     * Map of dirs built up during preparation.
+     */
+    private final Map<Path, DirMapState> dirMap;
+
+    public Result(Set<Path> createdDirectories,
+        Map<Path, DirMapState> dirMap) {
+      this.createdDirectories = requireNonNull(createdDirectories);
+      this.dirMap = requireNonNull(dirMap);
+    }
+
+    public Set<Path> getCreatedDirectories() {
+      return createdDirectories;
+    }
+
+    public Map<Path, DirMapState> getDirMap() {
+      return dirMap;
+    }
+
+    @Override
+    public String toString() {
+      return "Result{" +
+          "directory count=" + createdDirectories.size() +
+          '}';
+    }
+  }
+
+  /**
+   * Enumeration of dir states in the dir map.
+   */
+  public enum DirMapState {
+    dirFoundInStore,
+    dirFoundInMap,
+    dirWasCreated,
+    dirCreatedOnSecondAttempt,
+    fileNowDeleted,
+    ancestorWasDirOrMissing,
+    parentWasNotFile,
+    parentOfCreatedDir
+  }
+
+}
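A hedged sketch of invoking this stage during job commit and handing its output on to the rename stage; the class and accessor names are those defined above, while the surrounding method is illustrative only.

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CreateOutputDirectoriesStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class CreateDirsSketch {

  static Set<Path> prepareDestination(StageConfig stageConfig,
      List<TaskManifest> manifests) throws IOException {
    CreateOutputDirectoriesStage.Result result =
        new CreateOutputDirectoriesStage(stageConfig).apply(manifests);
    // the set of created directories lets the rename stage skip the
    // delete() probe for files whose parent was only just created.
    return result.getCreatedDirectories();
  }
}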

+ 33 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/JobOrTaskStage.java

@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.util.functional.FunctionRaisingIOE;
+
+/**
+ * A job or task stage is a function which may raise an IOException
+ * and which also provides IOStatistics.
+ * @param <IN> Type of arguments to the stage.
+ * @param <OUT> Type of result.
+ */
+public interface JobOrTaskStage<IN, OUT> extends FunctionRaisingIOE<IN, OUT>,
+    IOStatisticsSource {
+
+}
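Purely as an illustration of the interface, assuming (as the rest of this patch suggests, though it is not shown here) that the concrete stages implement it through their AbstractJobOrTaskStage superclass; the wrapper class is hypothetical.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.JobOrTaskStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class StageAsFunctionSketch {

  static Path runSetup(StageConfig taskStageConfig) throws IOException {
    // a stage is just a function from its argument type to its result type...
    JobOrTaskStage<String, Path> setup = new SetupTaskStage(taskStageConfig);
    Path taskAttemptDir = setup.apply("task setup");
    // ...which also publishes the statistics (such as stage duration) it collected.
    System.out.println(setup.getIOStatistics());
    return taskAttemptDir;
  }
}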

+ 293 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/LoadManifestsStage.java

@@ -0,0 +1,293 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static org.apache.commons.io.FileUtils.byteCountToDisplaySize;
+import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics;
+import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfInvocation;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_MANIFEST_FILE_SIZE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_LOAD_ALL_MANIFESTS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_LOAD_MANIFESTS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.maybeAddIOStatistics;
+
+/**
+ * Stage to load all the task manifests in the job attempt directory.
+ * Invoked in Job Commit.
+ * Manifests are loaded in parallel.
+ * The statistics of each manifest are aggregated into the stage summary,
+ * and stripped from the manifest itself if prune == true.
+ * This keeps the memory footprint of each manifest down.
+ */
+public class LoadManifestsStage extends
+    AbstractJobOrTaskStage<
+        Boolean,
+        LoadManifestsStage.Result> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      LoadManifestsStage.class);
+
+  /**
+   * Summary of manifest loading.
+   */
+  private final SummaryInfo summaryInfo = new SummaryInfo();
+
+  /**
+   * Should manifests be pruned of IOStatistics?
+   */
+  private boolean pruneManifests;
+
+  /**
+   * List of loaded manifests.
+   */
+  private final List<TaskManifest> manifests = new ArrayList<>();
+
+  public LoadManifestsStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_LOAD_MANIFESTS, true);
+  }
+
+  /**
+   * Load the manifests.
+   * @param prune should manifests be pruned of IOStatistics?
+   * @return the summary and a list of manifests.
+   * @throws IOException IO failure.
+   */
+  @Override
+  protected LoadManifestsStage.Result executeStage(
+      final Boolean prune) throws IOException {
+
+    final Path manifestDir = getTaskManifestDir();
+    LOG.info("{}: Executing Manifest Job Commit with manifests in {}",
+        getName(),
+        manifestDir);
+    pruneManifests = prune;
+    // build a list of all task manifests successfully committed
+    //
+    msync(manifestDir);
+    final RemoteIterator<FileStatus> manifestFiles = listManifests();
+
+    final List<TaskManifest> manifestList = loadAllManifests(manifestFiles);
+    LOG.info("{}: Summary of {} manifests loaded in {}: {}",
+        getName(),
+        manifestList.size(),
+        manifestDir,
+        summaryInfo);
+
+    // collect any stats
+    maybeAddIOStatistics(getIOStatistics(), manifestFiles);
+    return new LoadManifestsStage.Result(summaryInfo, manifestList);
+  }
+
+  /**
+   * Load all the manifests.
+   * @param manifestFiles list of manifest files.
+   * @return the loaded manifests.
+   * @throws IOException IO Failure.
+   */
+  private List<TaskManifest> loadAllManifests(
+      final RemoteIterator<FileStatus> manifestFiles) throws IOException {
+
+    trackDurationOfInvocation(getIOStatistics(), OP_LOAD_ALL_MANIFESTS, () ->
+        TaskPool.foreach(manifestFiles)
+            .executeWith(getIOProcessors())
+            .stopOnFailure()
+            .run(this::processOneManifest));
+    return manifests;
+  }
+
+  /**
+   * Method invoked to process one manifest.
+   * @param status file to process.
+   * @throws IOException failure to load/parse
+   */
+  private void processOneManifest(FileStatus status)
+      throws IOException {
+    updateAuditContext(OP_LOAD_ALL_MANIFESTS);
+
+    TaskManifest m = fetchTaskManifest(status);
+    progress();
+
+    // update the manifest list in a synchronized block.
+
+    synchronized (manifests) {
+      manifests.add(m);
+      // and the summary info in the same block, to
+      // eliminate the need to acquire a second lock.
+      summaryInfo.add(m);
+    }
+    if (pruneManifests) {
+      m.setIOStatistics(null);
+      m.getExtraData().clear();
+    }
+  }
+
+  /**
+   * Load and validate a single manifest file.
+   * To reduce the memory footprint, the IOStatistics and
+   * extra data of each manifest are cleared after loading.
+   * @param status status of file.
+   * @return the loaded manifest.
+   * @throws IOException IO Failure.
+   */
+  private TaskManifest fetchTaskManifest(FileStatus status)
+      throws IOException {
+    if (status.getLen() == 0 || !status.isFile()) {
+      throw new PathIOException(status.getPath().toString(),
+          "Not a valid manifest file; file status = " + status);
+    }
+    // load the manifest, which includes validation.
+    final TaskManifest manifest = loadManifest(status);
+    final String id = manifest.getTaskAttemptID();
+    final int filecount = manifest.getFilesToCommit().size();
+    final long size = manifest.getTotalFileSize();
+    LOG.info("{}: Task Attempt {} file {}: File count: {}; data size={}",
+        getName(), id, status.getPath(), filecount, size);
+    // record file size for tracking of memory consumption.
+    getIOStatistics().addMeanStatisticSample(COMMITTER_TASK_MANIFEST_FILE_SIZE,
+        status.getLen());
+    return manifest;
+  }
+
+  /**
+   * Result of the stage.
+   */
+  public static final class Result {
+    private final SummaryInfo summary;
+
+    private final List<TaskManifest> manifests;
+
+    public Result(SummaryInfo summary,
+        List<TaskManifest> manifests) {
+      this.summary = summary;
+      this.manifests = manifests;
+    }
+
+    public SummaryInfo getSummary() {
+      return summary;
+    }
+
+    public List<TaskManifest> getManifests() {
+      return manifests;
+    }
+
+  }
+
+  /**
+   * Summary information.
+   */
+  public static final class SummaryInfo implements IOStatisticsSource {
+
+    /**
+     * Aggregate IOStatistics.
+     */
+    private IOStatisticsSnapshot iostatistics = snapshotIOStatistics();
+
+    /**
+     * How many manifests were loaded.
+     */
+    private long manifestCount;
+
+    /**
+     * Total number of files to rename.
+     */
+    private long fileCount;
+
+    /**
+     * Total number of directories which may need
+     * to be created.
+     * As there is no dedup, this is likely to be
+     * a (major) overestimate.
+     */
+    private long directoryCount;
+
+    /**
+     * Total amount of data to be committed.
+     */
+    private long totalFileSize;
+
+    /**
+     * Get the IOStatistics.
+     * @return aggregate IOStatistics
+     */
+    @Override
+    public IOStatisticsSnapshot getIOStatistics() {
+      return iostatistics;
+    }
+
+    public long getFileCount() {
+      return fileCount;
+    }
+
+    public long getDirectoryCount() {
+      return directoryCount;
+    }
+
+    public long getTotalFileSize() {
+      return totalFileSize;
+    }
+
+    public long getManifestCount() {
+      return manifestCount;
+    }
+
+    /**
+     * Add all statistics.
+     * @param manifest manifest to add.
+     */
+    public void add(TaskManifest manifest) {
+      manifestCount++;
+      iostatistics.aggregate(manifest.getIOStatistics());
+      fileCount += manifest.getFilesToCommit().size();
+      directoryCount += manifest.getDestDirectories().size();
+      totalFileSize += manifest.getTotalFileSize();
+    }
+
+    /**
+     * The string form includes all summary info except statistics.
+     * @return string value
+     */
+    @Override
+    public String toString() {
+      final StringBuilder sb = new StringBuilder(
+          "SummaryInfo{");
+      sb.append("manifestCount=").append(manifestCount);
+      sb.append(", fileCount=").append(fileCount);
+      sb.append(", directoryCount=").append(directoryCount);
+      sb.append(", totalFileSize=").append(
+          byteCountToDisplaySize(totalFileSize));
+      sb.append('}');
+      return sb.toString();
+    }
+  }
+}
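A sketch of the load step in job commit; the prune flag and the Result/SummaryInfo accessors are those defined above, while the surrounding method and print statement are illustrative only.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.LoadManifestsStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class LoadManifestsSketch {

  static List<TaskManifest> loadForCommit(StageConfig stageConfig)
      throws IOException {
    // prune == true: strip per-manifest IOStatistics once aggregated,
    // to keep the heap footprint of a large job down.
    LoadManifestsStage.Result result =
        new LoadManifestsStage(stageConfig).apply(Boolean.TRUE);
    LoadManifestsStage.SummaryInfo summary = result.getSummary();
    System.out.printf("Loaded %d manifests: %d files, %d bytes%n",
        summary.getManifestCount(),
        summary.getFileCount(),
        summary.getTotalFileSize());
    return result.getManifests();
  }
}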

+ 173 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/RenameFilesStage.java

@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER_FILE_LIMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_RENAME_FILES;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createManifestOutcome;
+import static org.apache.hadoop.thirdparty.com.google.common.collect.Iterables.concat;
+
+/**
+ * This stage renames all the files.
+ * Input: the manifests and the set of directories created, as returned by
+ * {@link CreateOutputDirectoriesStage}.
+ * If the job is configured to delete target files, the delete() call
+ * can be skipped whenever the parent dir had to be created.
+ * It returns a manifest success data structure summarizing the
+ * output, but does not add IOStatistics to it.
+ */
+public class RenameFilesStage extends
+    AbstractJobOrTaskStage<
+        Pair<List<TaskManifest>, Set<Path>>,
+        ManifestSuccessData> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      RenameFilesStage.class);
+
+  /**
+   * List of all files committed.
+   */
+  private final List<FileEntry> filesCommitted = new ArrayList<>();
+
+  /**
+   * Total file size.
+   */
+  private long totalFileSize = 0;
+
+  private Set<Path> createdDirectories;
+
+  public RenameFilesStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_RENAME_FILES, true);
+  }
+
+  /**
+   * Get the list of files committed.
+   * The getter is synchronized; the returned list itself is not.
+   * @return direct access to the list of files.
+   */
+  public synchronized List<FileEntry> getFilesCommitted() {
+    return filesCommitted;
+  }
+
+  /**
+   * Get the total file size of the committed task.
+   * @return a number greater than or equal to zero.
+   */
+  public synchronized long getTotalFileSize() {
+    return totalFileSize;
+  }
+
+  /**
+   * Rename files in job commit.
+   * @param args pair of (task manifests, set of created directories).
+   * @return the job report.
+   * @throws IOException failure
+   */
+  @Override
+  protected ManifestSuccessData executeStage(
+      Pair<List<TaskManifest>, Set<Path>> args)
+      throws IOException {
+
+    final List<TaskManifest> taskManifests = args.getLeft();
+    createdDirectories = args.getRight();
+
+    final ManifestSuccessData success = createManifestOutcome(getStageConfig(),
+        OP_STAGE_JOB_COMMIT);
+    final int manifestCount = taskManifests.size();
+
+    LOG.info("{}: Executing Manifest Job Commit with {} manifests in {}",
+        getName(), manifestCount, getTaskManifestDir());
+
+    // first step is to aggregate the output of all manifests into a single
+    // list of files to commit.
+    // Which Guava can do in a zero-copy concatenated iterator
+
+    final Iterable<FileEntry> filesToCommit = concat(taskManifests.stream()
+        .map(TaskManifest::getFilesToCommit)
+        .collect(Collectors.toList()));
+
+    TaskPool.foreach(filesToCommit)
+        .executeWith(getIOProcessors())
+        .stopOnFailure()
+        .run(this::commitOneFile);
+
+    // synchronized block to keep spotbugs happy.
+    List<FileEntry> committed = getFilesCommitted();
+    LOG.info("{}: Files committed: {}. Total size {}",
+        getName(), committed.size(), getTotalFileSize());
+
+    // Add a subset of the destination files to the success file;
+    // enough for simple testing
+    success.setFilenamePaths(
+        committed
+            .subList(0, Math.min(committed.size(), SUCCESS_MARKER_FILE_LIMIT))
+            .stream().map(FileEntry::getDestPath)
+            .collect(Collectors.toList()));
+
+    success.setSuccess(true);
+
+    return success;
+  }
+
+  /**
+   * Commit one file by rename, then, if that doesn't fail,
+   * add to the files committed list.
+   * @param entry entry to commit.
+   * @throws IOException failure.
+   */
+  private void commitOneFile(FileEntry entry) throws IOException {
+    updateAuditContext(OP_STAGE_JOB_RENAME_FILES);
+
+    // report progress back
+    progress();
+
+    // if the destination is to be deleted before the rename,
+    // skip the delete when the parent dir was only just created:
+    // nothing can exist under it yet.
+    final boolean deleteDest = getStageConfig().getDeleteTargetPaths()
+        && !createdDirectories.contains(entry.getDestPath().getParent());
+    // do the rename
+    commitFile(entry, deleteDest);
+
+    // update the list and IOStats
+    synchronized (this) {
+      filesCommitted.add(entry);
+      totalFileSize += entry.getSize();
+    }
+
+  }
+
+}
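This is the invocation pattern used by the job commit stage earlier in this patch (the manifests paired with the created-directory set); shown here standalone as a sketch, with the wrapper class purely illustrative.

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.commons.lang3.tuple.Pair;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.RenameFilesStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class RenameFilesSketch {

  static ManifestSuccessData renameAll(StageConfig stageConfig,
      List<TaskManifest> manifests,
      Set<Path> createdDirs) throws IOException {
    // the created-directory set is what allows the delete() probe to be
    // skipped for destinations whose parent directory was just created.
    return new RenameFilesStage(stageConfig)
        .apply(Pair.of(manifests, createdDirs));
  }
}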

+ 79 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveSuccessFileStage.java

@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.TMP_SUFFIX;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_SAVE_SUCCESS;
+
+/**
+ * Save the _SUCCESS file to the destination directory
+ * via a temp file in the job attempt dir.
+ * Returns the path of the saved file.
+ */
+public class SaveSuccessFileStage extends
+    AbstractJobOrTaskStage<ManifestSuccessData, Path> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      SaveSuccessFileStage.class);
+
+  public SaveSuccessFileStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_SAVE_SUCCESS, false);
+  }
+
+  /**
+   * Stage name is always job commit.
+   * @param arguments args to the invocation.
+   * @return stage name
+   */
+  @Override
+  protected String getStageName(ManifestSuccessData arguments) {
+    // set it to the job commit stage, always.
+    return OP_STAGE_JOB_COMMIT;
+  }
+
+  /**
+   * Execute.
+   * @param successData success data to save
+   * @return path saved to.
+   * @throws IOException failure
+   */
+  @Override
+  protected Path executeStage(final ManifestSuccessData successData)
+      throws IOException {
+    // Save the marker
+    Path successFile = getStageConfig().getJobSuccessMarkerPath();
+    Path successTempFile = new Path(getJobAttemptDir(), SUCCESS_MARKER + TMP_SUFFIX);
+    LOG.debug("{}: Saving _SUCCESS file to {} via {}", successFile,
+        getName(),
+        successTempFile);
+    save(successData, successTempFile, successFile);
+    return successFile;
+  }
+
+}

+ 81 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java

@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_SAVE_MANIFEST;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestPathForTask;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestTempPathForTaskAttempt;
+
+/**
+ * Save a task manifest to the job attempt dir, using the task
+ * ID for the name of the final file.
+ * For atomic writes, the manifest is saved
+ * by writing to a temp file and then renaming it.
+ * The temp filename is derived from the task attempt ID;
+ * before the rename of (temp, final-path), any file at the final path
+ * is deleted.
+ * This is so that when this stage is invoked in a repeated task commit,
+ * its output overwrites that of any earlier commit.
+ * When it succeeds, therefore, unless there is a subsequent commit of
+ * another attempt of the same task, the manifest at the final path is from this
+ * operation.
+ *
+ * Returns the path where the manifest was saved.
+ */
+public class SaveTaskManifestStage extends
+    AbstractJobOrTaskStage<TaskManifest, Path> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      SaveTaskManifestStage.class);
+
+  public SaveTaskManifestStage(final StageConfig stageConfig) {
+    super(true, stageConfig, OP_STAGE_TASK_SAVE_MANIFEST, false);
+  }
+
+  /**
+   * Save the manifest to a temp file and rename to the final
+   * manifest destination.
+   * @param manifest manifest
+   * @return the path to the final entry
+   * @throws IOException IO failure.
+   */
+  @Override
+  protected Path executeStage(final TaskManifest manifest)
+      throws IOException {
+
+    final Path manifestDir = getTaskManifestDir();
+    // final manifest file is by task ID
+    Path manifestFile = manifestPathForTask(manifestDir,
+        getRequiredTaskId());
+    Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir,
+        getRequiredTaskAttemptId());
+    LOG.info("{}: Saving manifest file to {}", getName(), manifestFile);
+    save(manifest, manifestTempFile, manifestFile);
+    return manifestFile;
+  }
+
+}
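The write-to-temp-then-rename pattern described above is implemented by the superclass's save() (not shown in this file); the following is only an illustration of the idea using the plain Hadoop FileSystem API, not the committer's actual code, and the class and method names are hypothetical.

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class AtomicSaveSketch {

  static void saveAtomically(FileSystem fs, String json,
      Path tempPath, Path finalPath) throws IOException {
    // write the whole document to the temp path first
    try (FSDataOutputStream out = fs.create(tempPath, true)) {
      out.write(json.getBytes(StandardCharsets.UTF_8));
    }
    // delete anything already at the final path, then rename into place
    fs.delete(finalPath, false);
    if (!fs.rename(tempPath, finalPath)) {
      throw new IOException("Failed to rename " + tempPath + " to " + finalPath);
    }
  }
}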

+ 63 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupJobStage.java

@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_SETUP;
+
+/**
+ * Stage to set up a job by creating the job attempt directory.
+ * The job attempt directory must not exist before the call.
+ */
+public class SetupJobStage extends
+    AbstractJobOrTaskStage<Boolean, Path> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      SetupJobStage.class);
+
+  public SetupJobStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_SETUP, false);
+  }
+
+  /**
+   * Execute the job setup stage.
+   * @param deleteMarker should any existing success marker be deleted?
+   * @return the job attempt directory.
+   * @throws IOException failure.
+   */
+  @Override
+  protected Path executeStage(final Boolean deleteMarker) throws IOException {
+    final Path path = getJobAttemptDir();
+    LOG.info("{}: Creating Job Attempt directory {}", getName(), path);
+    createNewDirectory("Job setup", path);
+    createNewDirectory("Creating task manifest dir", getTaskManifestDir());
+    // delete any success marker if so instructed.
+    if (deleteMarker) {
+      delete(getStageConfig().getJobSuccessMarkerPath(), false);
+    }
+    return path;
+  }
+
+}

+ 54 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupTaskStage.java

@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.Path;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_SETUP;
+
+/**
+ * Stage to set up task.
+ * This creates the task attempt directory, after verifying
+ * that the job attempt dir exists (i.e. this is invoked
+ * after the job is started and before any cleanup).
+ * The argument passed in is the task name, used only for logging.
+ */
+public class SetupTaskStage extends
+    AbstractJobOrTaskStage<String, Path> {
+
+  public SetupTaskStage(final StageConfig stageConfig) {
+    super(true, stageConfig, OP_STAGE_TASK_SETUP, false);
+  }
+
+  /**
+   * Set up a task.
+   * @param name task name (for logging)
+   * @return task attempt directory
+   * @throws IOException IO failure.
+   */
+  @Override
+  protected Path executeStage(final String name) throws IOException {
+    return createNewDirectory("Task setup " + name,
+        requireNonNull(getTaskAttemptDir(), "No task attempt directory"));
+  }
+
+}
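A combined sketch of job and task setup using the two setup stages; the two StageConfig instances are assumed to have been built with the matching job and task attempt directories, and the wrapper class and print statement are illustrative only.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public final class SetupSketch {

  static Path setupJobThenTask(StageConfig jobConfig, StageConfig taskConfig)
      throws IOException {
    // true: delete any _SUCCESS marker left behind by an earlier job
    Path jobAttemptDir = new SetupJobStage(jobConfig).apply(true);
    System.out.println("Job attempt directory: " + jobAttemptDir);
    // the string argument is only used in log messages
    return new SetupTaskStage(taskConfig).apply("task setup");
  }
}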

+ 556 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java

@@ -0,0 +1,556 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.util.JsonSerialization;
+import org.apache.hadoop.util.Preconditions;
+import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER;
+
+/**
+ * Stage Config.
+ * Everything to configure a stage which is common to all.
+ *
+ * It's isolated from the details of MR datatypes (task ID, task attempt etc.);
+ * at this point it expects parsed values.
+ *
+ * It uses the builder API, but once {@link #build()} is called it goes
+ * read only. This is to ensure that changes cannot
+ * take place when shared across stages.
+ */
+public class StageConfig {
+
+  /**
+   * A flag which freezes the config for
+   * further updates.
+   */
+  private boolean frozen;
+
+  /**
+   * IOStatistics to update.
+   */
+  private IOStatisticsStore iostatistics;
+
+  /**
+   * Job ID; constant over multiple attempts.
+   */
+  private String jobId;
+
+  /**
+   * Where did the job Unique ID come from?
+   */
+  private String jobIdSource = "";
+
+  /**
+   * Number of the job attempt; starts at zero.
+   */
+  private int jobAttemptNumber;
+
+  /**
+   * ID of the task.
+   */
+  private String taskId;
+
+  /**
+   * ID of this specific attempt at a task.
+   */
+  private String taskAttemptId;
+
+  /**
+   * Destination of job.
+   */
+  private Path destinationDir;
+
+  /**
+   * Job attempt dir.
+   */
+  private Path jobAttemptDir;
+
+  /**
+   * temp directory under job dest dir.
+   */
+  private Path outputTempSubDir;
+
+  /**
+   * Task attempt dir.
+   */
+  private Path taskAttemptDir;
+
+  /**
+   * directory where task manifests must go.
+   */
+  private Path taskManifestDir;
+
+  /**
+   * Subdir under the job attempt dir where task
+   * attempts will have subdirectories.
+   */
+  private Path jobAttemptTaskSubDir;
+
+  /**
+   * Callbacks to update store.
+   * This is not made visible to the stages; they must
+   * go through the superclass which
+   * adds statistics and logging.
+   */
+  private ManifestStoreOperations operations;
+
+  /**
+   * Submitter for doing IO against the store other than
+   * manifest processing.
+   */
+  private TaskPool.Submitter ioProcessors;
+
+  /**
+   * Optional progress callback.
+   */
+  private Progressable progressable;
+
+  /**
+   * Callback when a stage is entered.
+   */
+  private StageEventCallbacks enterStageEventHandler;
+
+  /**
+   * Thread local serializer; created on demand
+   * and shareable across a sequence of stages.
+   */
+  private final ThreadLocal<JsonSerialization<TaskManifest>> threadLocalSerializer =
+      ThreadLocal.withInitial(TaskManifest::serializer);
+
+  /**
+   * Delete target paths on commit? Stricter, but
+   * higher IO cost.
+   */
+  private boolean deleteTargetPaths;
+
+  /**
+   * Name for logging.
+   */
+  private String name = "";
+
+  public StageConfig() {
+  }
+
+  /**
+   * Verify that the config is not yet frozen.
+   */
+  private void checkOpen() {
+    Preconditions.checkState(!frozen,
+        "StageConfig is now read-only");
+  }
+
+  /**
+   * The build command makes the config immutable.
+   * Idempotent.
+   * @return the now-frozen config
+   */
+  public StageConfig build() {
+    frozen = true;
+    return this;
+  }
+
+  /**
+   * Set job destination dir.
+   * @param dir new dir
+   * @return this
+   */
+  public StageConfig withDestinationDir(final Path dir) {
+    checkOpen();
+    destinationDir = dir;
+    return this;
+  }
+
+  /**
+   * Set IOStatistics store.
+   * @param store new store
+   * @return this
+   */
+  public StageConfig withIOStatistics(final IOStatisticsStore store) {
+    checkOpen();
+    iostatistics = store;
+    return this;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withIOProcessors(final TaskPool.Submitter value) {
+    checkOpen();
+    ioProcessors = value;
+    return this;
+  }
+
+  /**
+   * Set Job attempt directory.
+   * @param dir new dir
+   * @return this
+   */
+  public StageConfig withJobAttemptDir(final Path dir) {
+    checkOpen();
+    jobAttemptDir = dir;
+    return this;
+  }
+
+  /**
+   * Directory to put task manifests into.
+   * @return a path under the job attempt dir.
+   */
+  public Path getTaskManifestDir() {
+    return taskManifestDir;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return the builder
+   */
+  public StageConfig withTaskManifestDir(Path value) {
+    checkOpen();
+    taskManifestDir = value;
+    return this;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return the builder
+   */
+  public StageConfig withJobAttemptTaskSubDir(Path value) {
+    checkOpen();
+    jobAttemptTaskSubDir = value;
+    return this;
+  }
+
+  /**
+   * Get the path to the subdirectory under $jobID where task
+   * attempts are. List this dir to find all task attempt dirs.
+   * @return a path under the job attempt dir.
+   */
+  public Path getJobAttemptTaskSubDir() {
+    return jobAttemptTaskSubDir;
+  }
+
+  /**
+   * Set the job directories from the attempt directories
+   * information. Does not set task attempt fields.
+   * @param dirs source of directories.
+   * @return this
+   */
+  public StageConfig withJobDirectories(
+      final ManifestCommitterSupport.AttemptDirectories dirs) {
+
+    checkOpen();
+    withJobAttemptDir(dirs.getJobAttemptDir())
+        .withJobAttemptTaskSubDir(dirs.getJobAttemptTaskSubDir())
+        .withDestinationDir(dirs.getOutputPath())
+        .withOutputTempSubDir(dirs.getOutputTempSubDir())
+        .withTaskManifestDir(dirs.getTaskManifestDir());
+
+    return this;
+  }
+
+  /**
+   * Set job ID with no attempt included.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withJobId(final String value) {
+    checkOpen();
+    jobId = value;
+    return this;
+  }
+
+  public Path getOutputTempSubDir() {
+    return outputTempSubDir;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withOutputTempSubDir(final Path value) {
+    checkOpen();
+    outputTempSubDir = value;
+    return this;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withOperations(final ManifestStoreOperations value) {
+    checkOpen();
+    operations = value;
+    return this;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withTaskAttemptId(final String value) {
+    checkOpen();
+    taskAttemptId = value;
+    return this;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withTaskId(final String value) {
+    checkOpen();
+    taskId = value;
+    return this;
+  }
+
+  /**
+   * Set handler for stage entry/exit events.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withStageEventCallbacks(StageEventCallbacks value) {
+    checkOpen();
+    enterStageEventHandler = value;
+    return this;
+  }
+
+  /**
+   * Optional progress callback.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withProgressable(final Progressable value) {
+    checkOpen();
+    progressable = value;
+    return this;
+  }
+
+  /**
+   * Set the Task attempt directory.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withTaskAttemptDir(final Path value) {
+    checkOpen();
+    taskAttemptDir = value;
+    return this;
+  }
+
+  /**
+   * Set the job attempt number.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withJobAttemptNumber(final int value) {
+    checkOpen();
+    jobAttemptNumber = value;
+    return this;
+  }
+
+  /**
+   * Set the Job ID source.
+   * @param value new value
+   * @return this
+   */
+  public StageConfig withJobIdSource(final String value) {
+    checkOpen();
+    jobIdSource = value;
+    return this;
+  }
+
+  /**
+   * Set name of task/job.
+   * @param value new value
+   * @return the builder
+   */
+  public StageConfig withName(String value) {
+    name = value;
+    return this;
+  }
+
+  /**
+   * Get name of task/job.
+   * @return name for logging.
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * Handler for stage entry events.
+   * @return the handler.
+   */
+  public StageEventCallbacks getEnterStageEventHandler() {
+    return enterStageEventHandler;
+  }
+
+  /**
+   * IOStatistics to update.
+   */
+  public IOStatisticsStore getIOStatistics() {
+    return iostatistics;
+  }
+
+  /**
+   * Job ID.
+   */
+  public String getJobId() {
+    return jobId;
+  }
+
+  /**
+   * ID of the task.
+   */
+  public String getTaskId() {
+    return taskId;
+  }
+
+  /**
+   * ID of this specific attempt at a task.
+   */
+  public String getTaskAttemptId() {
+    return taskAttemptId;
+  }
+
+  /**
+   * Job attempt dir.
+   */
+  public Path getJobAttemptDir() {
+    return jobAttemptDir;
+  }
+
+  /**
+   * Destination of job.
+   */
+  public Path getDestinationDir() {
+    return destinationDir;
+  }
+
+  /**
+   * Get the location of the success marker.
+   * @return a path under the destination directory.
+   */
+  public Path getJobSuccessMarkerPath() {
+    return new Path(destinationDir, SUCCESS_MARKER);
+  }
+
+  /**
+   * Callbacks to update store.
+   * This is not made visible to the stages; they must
+   * go through the wrapper classes in this class, which
+   * add statistics and logging.
+   */
+  public ManifestStoreOperations getOperations() {
+    return operations;
+  }
+
+  /**
+   * Submitter for doing IO against the store other than
+   * manifest processing.
+   */
+  public TaskPool.Submitter getIoProcessors() {
+    return ioProcessors;
+  }
+
+  /**
+   * Get optional progress callback.
+   * @return callback or null
+   */
+  public Progressable getProgressable() {
+    return progressable;
+  }
+
+  /**
+   * Task attempt directory.
+   * @return the task attempt dir.
+   */
+  public Path getTaskAttemptDir() {
+    return taskAttemptDir;
+  }
+
+  /**
+   * Get the job attempt number.
+   * @return the value
+   */
+  public int getJobAttemptNumber() {
+    return jobAttemptNumber;
+  }
+
+  public String getJobIdSource() {
+    return jobIdSource;
+  }
+
+  /**
+   * Get a thread local task manifest serializer.
+   * @return a serializer.
+   */
+  public JsonSerialization<TaskManifest> currentManifestSerializer() {
+    return threadLocalSerializer.get();
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return the builder
+   */
+  public StageConfig withDeleteTargetPaths(boolean value) {
+    checkOpen();
+    deleteTargetPaths = value;
+    return this;
+  }
+
+  public boolean getDeleteTargetPaths() {
+    return deleteTargetPaths;
+  }
+
+  /**
+   * Enter the stage; calls back to
+   * {@link #enterStageEventHandler} if non-null.
+   * @param stage stage entered
+   */
+  public void enterStage(String stage) {
+    if (enterStageEventHandler != null) {
+      enterStageEventHandler.enterStage(stage);
+    }
+  }
+
+  /**
+   * Exit the stage; calls back to
+   * {@link #enterStageEventHandler} if non-null.
+   * @param stage stage exited.
+   */
+  public void exitStage(String stage) {
+    if (enterStageEventHandler != null) {
+      enterStageEventHandler.exitStage(stage);
+    }
+  }
+}
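
A minimal usage sketch of the builder above, assuming the caller supplies its own IDs and
paths (in practice the committer constructs and freezes its `StageConfig` internally; the
values here are placeholders):

```java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;

public class StageConfigExample {
  public static StageConfig example() {
    // chain the setters, then freeze the config with build()
    return new StageConfig()
        .withJobId("job_202203140921_0001")                          // placeholder job ID
        .withJobAttemptNumber(0)
        .withTaskAttemptId("attempt_202203140921_0001_m_000000_0")   // placeholder attempt ID
        .withDestinationDir(new Path("abfs://container@account.dfs.core.windows.net/output"))
        .withDeleteTargetPaths(false)
        .build();   // setters guarded by checkOpen() will now fail
  }
}
```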

+ 37 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageEventCallbacks.java

@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+/**
+ * Callbacks on stage entry/exit.
+ */
+public interface StageEventCallbacks {
+
+  /**
+   * Callback on stage entry.
+   * @param stage new stage
+   */
+  void enterStage(String stage);
+
+  /**
+   * Callback on stage exit.
+   * @param stage stage exited.
+   */
+  void exitStage(String stage);
+}
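
As an illustrative sketch only (not part of the committer), a trivial implementation of this
interface which logs stage transitions, and which could be passed to
`StageConfig.withStageEventCallbacks()`:

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageEventCallbacks;

public class LoggingStageEventCallbacks implements StageEventCallbacks {

  private static final Logger LOG =
      LoggerFactory.getLogger(LoggingStageEventCallbacks.class);

  @Override
  public void enterStage(String stage) {
    LOG.info("Entering stage {}", stage);
  }

  @Override
  public void exitStage(String stage) {
    LOG.info("Exiting stage {}", stage);
  }
}
```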

+ 214 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/TaskAttemptScanDirectoryStage.java

@@ -0,0 +1,214 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.LongSummaryStatistics;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.EntryStatus;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.DurationInfo;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_DIRECTORY_COUNT_MEAN;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_DIRECTORY_DEPTH_MEAN;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_FILE_COUNT_MEAN;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_FILE_SIZE_MEAN;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_SCAN_DIRECTORY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createTaskManifest;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.maybeAddIOStatistics;
+
+/**
+ * Stage to scan a directory tree and build a task manifest.
+ * This is executed by the task committer.
+ */
+public final class TaskAttemptScanDirectoryStage
+    extends AbstractJobOrTaskStage<Void, TaskManifest> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      TaskAttemptScanDirectoryStage.class);
+
+  public TaskAttemptScanDirectoryStage(
+      final StageConfig stageConfig) {
+    super(true, stageConfig, OP_STAGE_TASK_SCAN_DIRECTORY, false);
+  }
+
+  /**
+   * Build the Manifest.
+   * @return the manifest
+   * @throws IOException failure.
+   */
+  @Override
+  protected TaskManifest executeStage(final Void arguments)
+      throws IOException {
+
+    final Path taskAttemptDir = getRequiredTaskAttemptDir();
+    final TaskManifest manifest = createTaskManifest(getStageConfig());
+
+    LOG.info("{}: scanning directory {}",
+        getName(), taskAttemptDir);
+
+    final int depth = scanDirectoryTree(manifest,
+        taskAttemptDir,
+        getDestinationDir(),
+        0, true);
+    List<FileEntry> filesToCommit = manifest.getFilesToCommit();
+    LongSummaryStatistics fileSummary = filesToCommit.stream()
+        .mapToLong(FileEntry::getSize)
+        .summaryStatistics();
+    long fileDataSize = fileSummary.getSum();
+    long fileCount = fileSummary.getCount();
+    int dirCount = manifest.getDestDirectories().size();
+    LOG.info("{}: directory {} contained {} file(s); data size {}",
+        getName(),
+        taskAttemptDir,
+        fileCount,
+        fileDataSize);
+    LOG.info("{}: Directory count = {}; maximum depth {}",
+        getName(),
+        dirCount,
+        depth);
+    // add statistics about the task output which, when aggregated, provides
+    // insight into structure of job, task skew, etc.
+    IOStatisticsStore iostats = getIOStatistics();
+    iostats.addSample(COMMITTER_TASK_DIRECTORY_COUNT_MEAN, dirCount);
+    iostats.addSample(COMMITTER_TASK_DIRECTORY_DEPTH_MEAN, depth);
+    iostats.addSample(COMMITTER_TASK_FILE_COUNT_MEAN, fileCount);
+    iostats.addSample(COMMITTER_TASK_FILE_SIZE_MEAN, fileDataSize);
+
+    return manifest;
+  }
+
+  /**
+   * Recursively scan a directory tree.
+   * The manifest will contain all files to rename
+   * (source and dest) and directories to create.
+   * All files are processed before any of the subdirs are.
+   * This helps in statistics gathering.
+   * There's some optimizations which could be done with async
+   * fetching of the iterators of those subdirs, but as this
+   * is generally off-critical path then that "enhancement"
+   * can be postponed until data suggests this needs improvement.
+   * @param manifest manifest to update
+   * @param srcDir dir to scan
+   * @param destDir destination directory
+   * @param depth depth from the task attempt dir.
+   * @param parentDirExists does the parent dir exist?
+   * @return the maximum depth of child directories
+   * @throws IOException IO failure.
+   */
+  private int scanDirectoryTree(
+      TaskManifest manifest,
+      Path srcDir,
+      Path destDir,
+      int depth,
+      boolean parentDirExists) throws IOException {
+
+    // generate some task progress in case directory scanning is very slow.
+    progress();
+
+    int maxDepth = 0;
+    int files = 0;
+    boolean dirExists = parentDirExists;
+    List<FileStatus> subdirs = new ArrayList<>();
+    try (DurationInfo ignored = new DurationInfo(LOG, false,
+        "Task Attempt %s source dir %s, dest dir %s",
+        getTaskAttemptId(), srcDir, destDir)) {
+
+      // list the directory. This may block until the listing is complete,
+      // or, if the FS does incremental or asynchronous fetching,
+      // then the next()/hasNext() call will block for the results
+      // unless turned off; ABFS does this asynchronously.
+      final RemoteIterator<FileStatus> listing = listStatusIterator(srcDir);
+
+      // when the FS (especially ABFS) does an async fetch of the listing,
+      // we can probe for the status of the destination dir while that
+      // page is being fetched.
+      // probe for and add the dest dir entry for all but
+      // the base dir
+
+      if (depth > 0) {
+        final EntryStatus status;
+        if (parentDirExists) {
+          final FileStatus destDirStatus = getFileStatusOrNull(destDir);
+          status = EntryStatus.toEntryStatus(destDirStatus);
+          dirExists = destDirStatus != null;
+        } else {
+          // if there is no parent dir, then there is no need to look
+          // for this directory; report it as missing automatically.
+          status = EntryStatus.not_found;
+        }
+        manifest.addDirectory(DirEntry.dirEntry(
+            destDir,
+            status,
+            depth));
+      }
+
+      // process the listing; this is where abfs will block
+      // to wait for the result of the list call.
+      while (listing.hasNext()) {
+        final FileStatus st = listing.next();
+        if (st.isFile()) {
+          // this is a file, so add to the list of files to commit.
+          files++;
+          final FileEntry entry = fileEntry(st, destDir);
+          manifest.addFileToCommit(entry);
+          LOG.debug("To rename: {}", entry);
+        } else {
+          if (st.isDirectory()) {
+            // will need to scan this directory too.
+            subdirs.add(st);
+          } else {
+            // some other object. ignoring
+            LOG.info("Ignoring FS object {}", st);
+          }
+        }
+      }
+      // add any statistics provided by the listing.
+      maybeAddIOStatistics(getIOStatistics(), listing);
+    }
+
+    // now scan the subdirectories
+    LOG.debug("{}: Number of subdirectories under {} found: {}; file count {}",
+        getName(), srcDir, subdirs.size(), files);
+
+    for (FileStatus st : subdirs) {
+      Path destSubDir = new Path(destDir, st.getPath().getName());
+      final int d = scanDirectoryTree(manifest,
+          st.getPath(),
+          destSubDir,
+          depth + 1,
+          dirExists);
+      maxDepth = Math.max(maxDepth, d);
+    }
+
+    return 1 + maxDepth;
+  }
+
+}

+ 200 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/ValidateRenamedFilesStage.java

@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.OutputValidationException;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static org.apache.commons.lang3.StringUtils.isNotBlank;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_VALIDATE_OUTPUT;
+import static org.apache.hadoop.thirdparty.com.google.common.collect.Iterables.concat;
+
+/**
+ * This stage validates all files by scanning the manifests
+ * and verifying every file in every manifest is of the given size.
+ * Returns a list of all files committed.
+ *
+ * Its cost is one getFileStatus() call (parallelized) per file.
+ * Raises a {@link OutputValidationException} on a validation failure.
+ */
+public class ValidateRenamedFilesStage extends
+    AbstractJobOrTaskStage<
+        List<TaskManifest>,
+        List<FileEntry>> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      ValidateRenamedFilesStage.class);
+
+  /**
+   * Set this to halt all workers.
+   */
+  private final AtomicBoolean halt = new AtomicBoolean();
+
+  /**
+   * List of all files committed.
+   */
+  private List<FileEntry> filesCommitted = new ArrayList<>();
+
+  public ValidateRenamedFilesStage(final StageConfig stageConfig) {
+    super(false, stageConfig, OP_STAGE_JOB_VALIDATE_OUTPUT, true);
+  }
+
+  /**
+   * Get the list of files committed.
+   * @return a possibly empty list.
+   */
+  private synchronized List<FileEntry> getFilesCommitted() {
+    return filesCommitted;
+  }
+
+  /**
+   * Add a file entry to the list of committed files.
+   * @param entry entry
+   */
+  private synchronized void addFileCommitted(FileEntry entry) {
+    filesCommitted.add(entry);
+  }
+
+  /**
+   * Validate the task manifests.
+   * This is done by listing all the directories
+   * and verifying that every file in the source list
+   * has a file in the destination of the same size.
+   * If two tasks have both written the same file or
+   * a source file was changed after the task was committed,
+   * then a mismatch will be detected, provided the file
+   * length is now different.
+   * @param taskManifests list of manifests.
+   * @return list of files committed.
+   */
+  @Override
+  protected List<FileEntry> executeStage(
+      final List<TaskManifest> taskManifests)
+      throws IOException {
+
+    // set the list of files to be as big as the number of tasks.
+    // synchronized to stop complaints.
+    synchronized (this) {
+      filesCommitted = new ArrayList<>(taskManifests.size());
+    }
+
+    // validate all the files.
+
+    final Iterable<FileEntry> filesToCommit = concat(taskManifests.stream()
+        .map(TaskManifest::getFilesToCommit)
+        .collect(Collectors.toList()));
+
+    TaskPool.foreach(filesToCommit)
+        .executeWith(getIOProcessors())
+        .stopOnFailure()
+        .run(this::validateOneFile);
+
+    return getFilesCommitted();
+  }
+
+  /**
+   * Validate a file.
+   * @param entry entry to probe for
+   * @throws IOException IO problem.
+   * @throws OutputValidationException if the entry is not valid
+   */
+  private void validateOneFile(FileEntry entry) throws IOException {
+    updateAuditContext(OP_STAGE_JOB_VALIDATE_OUTPUT);
+
+    if (halt.get()) {
+      // told to stop
+      return;
+    }
+    // report progress back
+    progress();
+    // validate the file,
+    // raising an FNFE if the file isn't there.
+    FileStatus destStatus;
+    final Path sourcePath = entry.getSourcePath();
+    Path destPath = entry.getDestPath();
+    try {
+      destStatus = getFileStatus(destPath);
+
+      // it must be a file
+      if (!destStatus.isFile()) {
+        throw new OutputValidationException(destPath,
+            "Expected a file renamed from " + sourcePath
+                + "; found " + destStatus);
+      }
+      final long sourceSize = entry.getSize();
+      final long destSize = destStatus.getLen();
+
+      // etags, if the source had one.
+      final String sourceEtag = entry.getEtag();
+      if (isNotBlank(sourceEtag)) {
+        final String destEtag = ManifestCommitterSupport.getEtag(destStatus);
+        if (!sourceEtag.equals(destEtag)) {
+          LOG.warn("Etag of dest file {}: {} does not match that of manifest entry {}",
+              destPath, destStatus, entry);
+          throw new OutputValidationException(destPath,
+              String.format("Expected the file"
+                      + " renamed from %s"
+                      + " with etag %s and length %s"
+                      + " but found a file with etag %s and length %d",
+                  sourcePath,
+                  sourceEtag,
+                  sourceSize,
+                  destEtag,
+                  destSize));
+
+        }
+      }
+      // check the expected length after any etag validation
+      if (destSize != sourceSize) {
+        LOG.warn("Length of dest file {}: {} does not match that of manifest entry {}",
+            destPath, destStatus, entry);
+        throw new OutputValidationException(destPath,
+            String.format("Expected the file"
+                    + " renamed from %s"
+                    + " with length %d"
+                    + " but found a file of length %d",
+                sourcePath,
+                sourceSize,
+                destSize));
+      }
+
+    } catch (FileNotFoundException e) {
+      // file didn't exist
+      throw new OutputValidationException(destPath,
+          "Expected a file, but it was not found", e);
+    }
+    addFileCommitted(entry);
+  }
+
+}

+ 30 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/package-info.java

@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Intermediate manifest committer.
+ *
+ * The individual and aggregate stages of the task and job commit process.
+ */
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;

+ 19 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml

@@ -2231,4 +2231,23 @@
  </description>
</property>
 
+<!-- not yet enabled by default.
+
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.abfs</name>
+  <value>org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory</value>
+  <description>
+    The default committer factory for ABFS is for the manifest committer with
+    abfs-specific tuning.
+  </description>
+</property>
+
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.gs</name>
+  <value>org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory</value>
+  <description>
+    The default committer factory for google cloud storage is for the manifest committer.
+  </description>
+</property>
+-->
</configuration>

+ 605 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md

@@ -0,0 +1,605 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+
+# The Manifest Committer for Azure and Google Cloud Storage
+
+This document describes how to use the _Manifest Committer_.
+
+The _Manifest_ committer is a committer for work which provides
+performance on ABFS for "real world" queries,
+and performance and correctness on GCS.
+It also works with other filesystems, including HDFS.
+However, the design is optimized for object stores where
+listing operations are slow and expensive.
+
+The architecture and implementation of the committer is covered in
+[Manifest Committer Architecture](manifest_committer_architecture.html).
+
+
+The protocol and its correctness are covered in
+[Manifest Committer Protocol](manifest_committer_protocol.html).
+
+It was added in March 2022, and should be considered unstable
+in early releases.
+
+<!-- MACRO{toc|fromDepth=0|toDepth=2} -->
+
+## Problem:
+
+The only committer of work from Spark to Azure ADLS Gen 2 "abfs://" storage
+which is safe to use is the "v1 file committer".
+
+This is "correct" in that if a task attempt fails, its output is guaranteed not
+to be included in the final output. The "v2" commit algorithm cannot meet that
+guarantee, which is why it is no longer the default.
+
+But: it is slow, especially on jobs where deep directory trees of output are used.
+Why is it slow? It's hard to point at a particular cause, primarily because of
+the lack of any instrumentation in the `FileOutputCommitter`.
+Stack traces of running jobs generally show `rename()`, though list operations
+do surface too.
+
+On Google GCS, neither the v1 nor the v2 algorithm is _safe_, because the Google
+filesystem doesn't have the atomic directory rename which the v1 algorithm
+requires.
+
+A further issue is that both Azure and GCS storage may encounter scale issues
+with deleting directories with many descendants.
+This can trigger timeouts because the FileOutputCommitter assumes that
+cleaning up after the job is a fast call to `delete("_temporary", true)`.
+
+## Solution.
+
+The _Intermediate Manifest_ committer is a new committer for
+work which should deliver performance on ABFS
+for "real world" queries, and performance and correctness on GCS.
+
+This committer uses the extension point which came in for the S3A committers.
+Users can declare a new committer factory for abfs:// and gcs:// URLs.
+A suitably configured spark deployment will pick up the new committer.
+
+Directory performance issues in job cleanup can be addressed by two options:
+1. The committer will parallelize deletion of task attempt directories before
+   deleting the `_temporary` directory.
+1. Cleanup can be disabled.
+
+The committer can be used with any filesystem client which has a "real" file rename()
+operation.
+It has been optimised for remote object stores where listing and file probes
+are expensive; the design is less likely to offer such significant speedup
+on HDFS, though the parallel renaming operations will still speed up jobs
+there compared to the classic v1 algorithm.
+
+# <a name="how"></a> How it works
+
+The full details are covered in [Manifest Committer Architecture](manifest_committer_architecture.html).
+
+# <a name="use"></a> Using the committer
+
+The hooks put in to support the S3A committers were designed to allow every
+filesystem schema to provide their own committer.
+See [Switching To an S3A Committer](../../hadoop-aws/tools/hadoop-aws/committers.html#Switching_to_an_S3A_Committer)
+
+A factory for the abfs schema would be defined in
+`mapreduce.outputcommitter.factory.scheme.abfs`; and a similar one for `gcs`.
+
+Some matching spark configuration changes, especially for parquet binding, will be required.
+These can be set in `core-site.xml` if they are not defined in the `mapred-default.xml` JAR.
+
+
+```xml
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.abfs</name>
+  <value>org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory</value>
+</property>
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.gs</name>
+  <value>org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory</value>
+</property>
+```
+
+## Binding to the manifest committer in Spark.
+
+In Apache Spark, the configuration can be done either with command line options (after `--conf`) or by using the `spark-defaults.conf` file. The following is an example of using `spark-defaults.conf`, which also includes the configuration for Parquet with a subclass of the Parquet
+committer which uses the factory mechanism internally.
+
+```
+spark.hadoop.mapreduce.outputcommitter.factory.scheme.abfs org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory
+spark.hadoop.mapreduce.outputcommitter.factory.scheme.gs org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory
+spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
+spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
+```
+
+
+### <a name="committerinfo"></a> Using the Cloudstore `committerinfo` command to probe committer bindings.
+
+The hadoop committer settings can be validated in a recent build of [cloudstore](https://github.com/steveloughran/cloudstore)
+and its `committerinfo` command.
+This command instantiates a committer for that path through the same factory mechanism as MR and spark jobs use,
+then prints its `toString` value.
+
+```
+hadoop jar cloudstore-1.0.jar committerinfo abfs://testing@ukwest.dfs.core.windows.net/
+
+2021-09-16 19:42:59,731 [main] INFO  commands.CommitterInfo (StoreDurationInfo.java:<init>(53)) - Starting: Create committer
+Committer factory for path abfs://testing@ukwest.dfs.core.windows.net/ is
+ org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory@3315d2d7
+  (classname org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory)
+2021-09-16 19:43:00,897 [main] INFO  manifest.ManifestCommitter (ManifestCommitter.java:<init>(144)) - Created ManifestCommitter with
+   JobID job__0000, Task Attempt attempt__0000_r_000000_1 and destination abfs://testing@ukwest.dfs.core.windows.net/
+Created committer of class org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitter:
+ ManifestCommitter{ManifestCommitterConfig{destinationDir=abfs://testing@ukwest.dfs.core.windows.net/,
+   role='task committer',
+   taskAttemptDir=abfs://testing@ukwest.dfs.core.windows.net/_temporary/manifest_job__0000/0/_temporary/attempt__0000_r_000000_1,
+   createJobMarker=true,
+   jobUniqueId='job__0000',
+   jobUniqueIdSource='JobID',
+   jobAttemptNumber=0,
+   jobAttemptId='job__0000_0',
+   taskId='task__0000_r_000000',
+   taskAttemptId='attempt__0000_r_000000_1'},
+   iostatistics=counters=();
+
+gauges=();
+
+minimums=();
+
+maximums=();
+
+means=();
+}
+
+```
+
+
+## Verifying that the committer was used
+
+The new committer will write a JSON summary of the operation, including statistics, in the `_SUCCESS` file.
+
+If this file exists and is zero bytes long: the classic `FileOutputCommitter` was used.
+
+If this file exists and is greater than zero bytes long, either the manifest committer was used,
+or in the case of S3A filesystems, one of the S3A committers. They all use the same JSON format.
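+
+As a quick command-line check (the output path here is illustrative):
+
+```bash
+# a zero-byte _SUCCESS file means the classic FileOutputCommitter was used
+hadoop fs -ls abfs://testing@ukwest.dfs.core.windows.net/output/_SUCCESS
+
+# a non-empty _SUCCESS file is JSON and can be printed directly
+hadoop fs -cat abfs://testing@ukwest.dfs.core.windows.net/output/_SUCCESS
+```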
+
+# <a name="configuration"></a> Configuration options
+
+Here are the main configuration options of the committer.
+
+
+| Option | Meaning | Default Value |
+|--------|---------|---------------|
+| `mapreduce.manifest.committer.delete.target.files` | Delete target files? | `false` |
+| `mapreduce.manifest.committer.io.threads` | Thread count for parallel operations | `64` |
+| `mapreduce.manifest.committer.summary.report.directory` | directory to save reports. | `""` |
+| `mapreduce.manifest.committer.cleanup.parallel.delete` | Delete temporary directories in parallel | `true` |
+| `mapreduce.fileoutputcommitter.cleanup.skipped` | Skip cleanup of `_temporary` directory| `false` |
+| `mapreduce.fileoutputcommitter.cleanup-failures.ignored` | Ignore errors during cleanup | `false` |
+| `mapreduce.fileoutputcommitter.marksuccessfuljobs` | Create a `_SUCCESS` marker file on successful completion. (and delete any existing one in job setup) | `true` |
+
+There are some more, as covered in the [Advanced](#advanced) section.
+
+
+## <a name="scaling"></a> Scaling jobs `mapreduce.manifest.committer.io.threads`
+
+The core reason that this committer is faster than the classic `FileOutputCommitter`
+is that it tries to parallelize as much file IO as it can during job commit, specifically:
+
+* task manifest loading
+* deletion of files where directories will be created
+* directory creation
+* file-by-file renaming
+* deletion of task attempt directories in job cleanup
+
+These operations are all performed in the same thread pool, whose size is set
+in the option `mapreduce.manifest.committer.io.threads`.
+
+Larger values may be used.
+
+XML
+```xml
+<property>
+  <name>mapreduce.manifest.committer.io.threads</name>
+  <value>200</value>
+</property>
+```
+
+spark-defaults.conf
+```
+spark.hadoop.mapreduce.manifest.committer.io.threads 200
+```
+
+A larger value than that of the number of cores allocated to
+the MapReduce AM or Spark Driver does not directly overload
+the CPUs, as the threads are normally waiting for (slow) IO
+against the object store/filesystem to complete.
+
+Caveats
+* In Spark, multiple jobs may be committed in the same process,
+  each of which will create their own thread pool during job
+  commit or cleanup.
+* Azure rate throttling may be triggered if too many IO requests
+  are made against the store. The rate throttling option
+  `mapreduce.manifest.committer.io.rate` can help avoid this.
+
+
+## <a name="deleting"></a> Optional: deleting target files in Job Commit
+
+The classic `FileOutputCommitter` deletes files at the destination paths
+before renaming the job's files into place.
+
+This is optional in the manifest committers, set in the option
+`mapreduce.manifest.committer.delete.target.files` with a default value of `false`.
+
+This increases performance and is safe to use when all files created by a job
+have unique filenames.
+
+Apache Spark does generate unique filenames for ORC and Parquet since
+[SPARK-8406](https://issues.apache.org/jira/browse/SPARK-8406)
+_Adding UUID to output file name to avoid accidental overwriting_
+
+Avoiding checks for/deleting target files saves one delete call per file being committed, so can
+save a significant amount of store IO.
+
+When appending to existing tables with formats other than ORC and Parquet,
+enable deletion of the target files unless confident that unique identifiers
+are added to each filename.
+
+```
+spark.hadoop.mapreduce.manifest.committer.delete.target.files true
+```
+
+*Note 1:* the committer will skip deletion operations when it
+created the directory into which a file is to be renamed.
+This makes it slightly more efficient, at least if jobs
+appending data are creating and writing into new partitions.
+
+*Note 2:* the committer still requires tasks within a single
+job to create unique files. This is foundational for
+any job to generate correct data.
+
+# <a name="SUCCESS"></a> Job Summaries in `_SUCCESS` files
+
+The original hadoop committer creates a zero byte `_SUCCESS` file in the root of the output directory
+unless disabled.
+
+This committer writes a JSON summary which includes
+* The name of the committer.
+* Diagnostics information.
+* A list of some of the files created (for testing; a full list is excluded as it can get big).
+* IO Statistics.
+
+If, after running a query, this `_SUCCESS` file is zero bytes long,
+*the new committer has not been used*.
+
+If it is not empty, then it can be examined.
+
+## <a name="printer"></a> Viewing `_SUCCESS` files through the `ManifestPrinter` tool.
+
+The summary files are JSON, and can be viewed in any text editor.
+
+For a more succinct summary, including better display of statistics, use the `ManifestPrinter` tool.
+
+```bash
+hadoop org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestPrinter <path>
+```
+
+This works for the files saved at the base of an output directory, and
+any reports saved to a report directory.
+
+## <a name="summaries"></a> Collecting Job Summaries `mapreduce.manifest.committer.summary.report.directory`
+
+The committer can be configured to save the `_SUCCESS` summary files to a report directory,
+irrespective of whether the job succeeded or failed, by setting a filesystem path in
+the option `mapreduce.manifest.committer.summary.report.directory`.
+
+The path does not have to be on the same
+store/filesystem as the destination of work. For example, a local filesystem could be used.
+
+XML
+
+```xml
+<property>
+  <name>mapreduce.manifest.committer.summary.report.directory</name>
+  <value>file:///tmp/reports</value>
+</property>
+```
+
+spark-defaults.conf
+
+```
+spark.hadoop.mapreduce.manifest.committer.summary.report.directory file:///tmp/reports
+```
+
+This allows for the statistics of jobs to be collected irrespective of their outcome, whether or not
+saving the `_SUCCESS` marker is enabled, and without problems caused by a chain of queries
+overwriting the markers.
+
+
+# <a name="cleanup"></a> Cleanup
+
+Job cleanup is convoluted as it is designed to address a number of issues which
+may surface in cloud storage.
+
+* Slow performance for deletion of directories.
+* Timeout when deleting very deep and wide directory trees.
+* General resilience to cleanup issues escalating to job failures.
+
+
+| Option | Meaning | Default Value |
+|--------|---------|---------------|
+| `mapreduce.fileoutputcommitter.cleanup.skipped` | Skip cleanup of `_temporary` directory| `false` |
+| `mapreduce.fileoutputcommitter.cleanup-failures.ignored` | Ignore errors during cleanup | `false` |
+| `mapreduce.manifest.committer.cleanup.parallel.delete` | Delete task attempt directories in parallel | `true` |
+
+The algorithm is:
+
+```
+if `mapreduce.fileoutputcommitter.cleanup.skipped`:
+  return
+if `mapreduce.manifest.committer.cleanup.parallel.delete`:
+  attempt parallel delete of task directories; catch any exception
+if not `mapreduce.fileoutputcommitter.cleanup.skipped`:
+  delete(`_temporary`); catch any exception
+if caught-exception and not `mapreduce.fileoutputcommitter.cleanup-failures.ignored`:
+  throw caught-exception
+```
+
+It's a bit complicated, but the goal is to perform a fast/scalable delete and
+throw a meaningful exception if that didn't work.
+
+When working with ABFS and GCS, these settings should normally be left alone.
+If somehow errors surface during cleanup, enabling the option to
+ignore failures will ensure the job still completes.
+Disabling cleanup avoids the overhead of the cleanup phase entirely, but
+requires a workflow or manual operation to clean up all
+`_temporary` directories on a regular basis.
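+
+As a sketch, a deployment which wanted cleanup problems ignored rather than escalated to
+job failure could add the following to `spark-defaults.conf`; normally these defaults are
+best left alone:
+
+```
+spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored true
+```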
+
+
+# <a name="abfs"></a> Working with Azure ADLS Gen2 Storage
+
+To switch to the manifest committer, the factory for committers for destinations with `abfs://` URLs must
+be switched to the manifest committer factory, either for the application or
+the entire cluster.
+
+```xml
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.abfs</name>
+  <value>org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory</value>
+</property>
+```
+
+This allows for ADLS Gen2-specific performance and consistency logic to be used from within the committer.
+In particular:
+* the [Etag](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag) header
+can be collected in listings and used in the job commit phase.
+* IO rename operations are rate limited
+* recovery is attempted when throttling triggers rename failures.
+
+*Warning* This committer is not compatible with older Azure storage services
+(WASB or ADLS Gen 1).
+
+The core set of Azure-optimized options becomes
+
+```xml
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.abfs</name>
+  <value>org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory</value>
+</property>
+
+<property>
+  <name>spark.hadoop.fs.azure.io.rate.limit</name>
+  <value>10000</value>
+</property>
+```
+
+And optional settings for debugging/performance analysis
+
+```xml
+<property>
+  <name>mapreduce.manifest.committer.summary.report.directory</name>
+  <value>abfs:// Path within same store/separate store</value>
+  <description>Optional: path to where job summaries are saved</description>
+</property>
+```
+
+## <a name="abfs-options"></a> Full set of ABFS options for spark
+
+```
+spark.hadoop.mapreduce.outputcommitter.factory.scheme.abfs org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory
+spark.hadoop.fs.azure.io.rate.limit 10000
+spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
+spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
+
+spark.hadoop.mapreduce.manifest.committer.summary.report.directory  (optional: URI of a directory for job summaries)
+```
+
+## Experimental: ABFS Rename Rate Limiting `fs.azure.io.rate.limit`
+
+To avoid triggering store throttling and backoff delays, as well as other
+throttling-related failure conditions, file renames during job commit
+are throttled through a "rate limiter" which limits the number of
+rename operations per second a single instance of the ABFS FileSystem client
+may issue.
+
+| Option | Meaning |
+|--------|---------|
+| `fs.azure.io.rate.limit` | Rate limit in operations/second for IO operations. |
+
+Set the option to `0` to remove all rate limiting.
+
+The default value of this is set to 10000, which is the default IO capacity for
+an ADLS storage account.
+
+```xml
+<property>
+  <name>fs.azure.io.rate.limit</name>
+  <value>10000</value>
+  <description>maximum number of renames attempted per second</description>
+</property>
+```
+
+This capacity is set at the level of the filesystem client, and so is not
+shared across all processes within a single application, let
+alone other applications sharing the same storage account.
+
+It will be shared with all jobs being committed by the same
+Spark driver, as these do share that filesystem connector.
+
+If rate limiting is imposed, the statistic `store_io_rate_limited` will
+report the time to acquire permits for committing files.
+
+If server-side throttling took place, signs of this can be seen in
+* The store service's logs and their throttling status codes (usually 503 or 500).
+* The job statistic `commit_file_rename_recovered`. This statistic indicates that
+  ADLS throttling manifested as failures in renames, failures which were recovered
+  from in the committer.
+
+If these are seen, or other applications running at the same time experience
+throttling or throttling-triggered problems, consider reducing the value of
+`fs.azure.io.rate.limit`, and/or requesting a higher IO capacity from Microsoft.
+
+*Important* if you do get extra capacity from Microsoft and you want to use
+it to speed up job commits, increase the value of `fs.azure.io.rate.limit`
+either across the cluster, or specifically for those jobs which you wish
+to allocate extra priority to.
+
+This is still a work in progress; it may be expanded to support
+all IO operations performed by a single filesystem instance.
+
+# <a name="gcs"></a> Working with Google Cloud Storage
+
+The manifest committer is compatible with and tested against Google cloud storage through
+the gcs-connector library from google, which provides a Hadoop filesystem client for the
+schema `gs`.
+
+Google cloud storage has the semantics needed for the commit protocol
+to work safely.
+
+The Spark settings to switch to this committer are
+
+```
+spark.hadoop.mapreduce.outputcommitter.factory.scheme.gs org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory
+spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
+spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
+
+spark.hadoop.mapreduce.manifest.committer.summary.report.directory  (optional: URI of a directory for job summaries)
+```
+
+The store's directory delete operations are `O(files)` so the value
+of `mapreduce.manifest.committer.cleanup.parallel.delete`
+SHOULD be left at the default of `true`.
+
+For MapReduce, declare the binding in `core-site.xml` or `mapred-site.xml`:
+```xml
+<property>
+  <name>mapreduce.outputcommitter.factory.scheme.gs</name>
+  <value>org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory</value>
+</property>
+```
+
+
+# <a name="hdfs"></a> Working with HDFS
+
+This committer _does_ work with HDFS; it has just been targeted at object stores with
+reduced performance on some operations, especially listing and renaming,
+and semantics too reduced for the classic `FileOutputCommitter` to rely on
+(specifically GCS).
+
+To use on HDFS, set the `ManifestCommitterFactory` as the committer factory for `hdfs://` URLs.
+
+Because HDFS does fast directory deletion, there is no need to parallelize deletion
+of task attempt directories during cleanup, so set
+`mapreduce.manifest.committer.cleanup.parallel.delete` to `false`.
+
+The final spark bindings become
+
+```
+spark.hadoop.mapreduce.outputcommitter.factory.scheme.hdfs org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory
+spark.hadoop.mapreduce.manifest.committer.cleanup.parallel.delete false
+spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
+spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
+
+spark.hadoop.mapreduce.manifest.committer.summary.report.directory  (optional: URI of a directory for job summaries)
+```
+
+# <a name="advanced"></a> Advanced Topics
+
+## Advanced Configuration options
+
+There are some advanced options which are intended for development and testing,
+rather than production use.
+
+| Option | Meaning | Default Value |
+|--------|---------|---------------|
+| `mapreduce.manifest.committer.store.operations.classname` | Classname for Manifest Store Operations | `""` |
+| `mapreduce.manifest.committer.validate.output` | Perform output validation? | `false` |
+
+
+## Validating output  `mapreduce.manifest.committer.validate.output`
+
+The option `mapreduce.manifest.committer.validate.output` triggers a check of every renamed file to
+verify it has the expected length.
+
+This adds the overhead of a `HEAD` request per file, and so is recommended for testing only.
+
+There is no verification of the actual contents.
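+
+To enable it for a test run (a sketch; leave it disabled in production):
+
+```xml
+<property>
+  <name>mapreduce.manifest.committer.validate.output</name>
+  <value>true</value>
+</property>
+```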
+
+## Controlling storage integration `mapreduce.manifest.committer.store.operations.classname`
+
+The manifest committer interacts with filesystems through implementations of the interface
+`ManifestStoreOperations`.
+It is possible to provide custom implementations for store-specific features.
+There is one of these for ABFS; when the abfs-specific committer factory is used this
+is automatically set.
+
+It can be explicitly set.
+```xml
+<property>
+  <name>mapreduce.manifest.committer.store.operations.classname</name>
+  <value>org.apache.hadoop.fs.azurebfs.commit.AbfsManifestStoreOperations</value>
+</property>
+```
+
+The default implementation may also be configured.
+
+```xml
+<property>
+  <name>mapreduce.manifest.committer.store.operations.classname</name>
+  <value>org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperationsThroughFileSystem</value>
+</property>
+```
+
+There is no need to alter these values, except when writing new implementations for other stores,
+something which is only needed if the store provides extra integration support for the
+committer.
+
+## <a name="concurrent"></a> Support for concurrent test runs.
+
+It *may* be possible to run multiple jobs targeting the same directory tree.
+
+For this to work, a number of conditions must be met:
+
+* When using spark, unique job IDs must be set. This means the Spark distribution
+  MUST contain the patches for
+  [SPARK-33402](https://issues.apache.org/jira/browse/SPARK-33402)
+  and
+  [SPARK-33230](https://issues.apache.org/jira/browse/SPARK-33230).
+* Cleanup of the `_temporary` directory must be disabled by setting
+  `mapreduce.fileoutputcommitter.cleanup.skipped` to `true`, as shown in the sketch below.
+* All jobs/tasks must create files with unique filenames.
+* All jobs must create output with the same directory partition structure.
+* Remember to delete the `_temporary` directory later!
+
+This has *NOT BEEN TESTED*
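+
+As an illustrative sketch only, such an (untested) setup would disable cleanup in its
+Spark configuration:
+
+```
+spark.hadoop.mapreduce.fileoutputcommitter.cleanup.skipped true
+```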

+ 335 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_architecture.md

@@ -0,0 +1,335 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+
+# Manifest Committer Architecture
+
+This document describes the architecture and other implementation/correctness
+aspects of the [Manifest Committer](manifest_committer.html)
+
+The protocol and its correctness are covered in [Manifest Committer Protocol](manifest_committer_protocol.html).
+<!-- MACRO{toc|fromDepth=0|toDepth=2} -->
+
+The _Manifest_ committer is a committer for work which provides performance on ABFS for "real world"
+queries, and performance and correctness on GCS.
+
+This committer uses the extension point which came in for the S3A committers.
+Users can declare a new committer factory for `abfs://` and `gcs://` URLs.
+It can be used through Hadoop MapReduce and Apache Spark.
+
+## Background
+
+### Terminology
+
+| Term | Meaning|
+|------|--------|
+| Committer |  A class which can be invoked by MR/Spark to perform the task and job commit operations. |
+| Spark Driver | The spark process scheduling the work and choreographing the commit operation.|
+| Job  | In MapReduce, the entire application. In spark, this is a single stage in a chain of work |
+| Job Attempt | A single attempt at a job. MR supports multiple Job attempts with recovery on partial job failure. Spark says "start again from scratch" |
+| Task | a subsection of a job, such as processing one file, or one part of a file |
+| Task ID |  ID of the task, unique within this job. Usually starts at 0 and is used in filenames (part-0000, part-0001, etc.) |
+| Task attempt (TA) | An attempt to perform a task. It may fail, in which case MR/spark will schedule another. |
+| Task Attempt ID | A unique ID for the task attempt. The Task ID + an attempt counter.|
+|  Destination directory | The final destination of work.|
+| Job Attempt Directory | A temporary directory used by the job attempt. This is always _underneath_ the destination directory, so as to ensure it is in the same encryption zone on HDFS, the same storage volume in other filesystems, etc.|
+| Task Attempt directory | (also known as "Task Attempt Working Directory"). Directory exclusive for each task attempt under which files are written |
+| Task Commit | Taking the output of a Task Attempt and making it the final/exclusive result of that "successful" Task.|
+| Job Commit | aggregating all the outputs of all committed tasks and producing the final results of the job. |
+
+
+
+The purpose of a committer is to ensure that the complete output of
+a job ends up in the destination, even in the presence of failures of tasks.
+
+- _Complete:_ the output includes the work of all successful tasks.
+- _Exclusive:_ the output of unsuccessful tasks is not present.
+- _Concurrent:_ When multiple tasks are committed in parallel the output is the same as when
+  the task commits are serialized. This is not a requirement of Job Commit.
+- _Abortable:_  jobs and tasks may be aborted prior to job commit, after which their output is not visible.
+- _Continuity of correctness:_ once a job is committed, the output of any failed,
+  aborted, or unsuccessful task MUST NOT appear at any point in the future.
+
+For Hive's classic hierarchical-directory-structured tables, job committing
+requires the output of all committed tasks to be put into the correct location
+in the directory tree.
+
+The committer built into `hadoop-mapreduce-client-core` module is the `FileOutputCommitter`.
+
+
+
+## The Manifest Committer: A high performance committer for Spark on Azure and Google storage.
+
+The Manifest Committer is a higher performance committer for ABFS and GCS storage
+for jobs which create files across deep directory trees through many tasks.
+
+It will also work on `hdfs://` and indeed, `file://` URLs, but
+it is optimized to address listing and renaming performance and throttling
+issues in cloud storage.
+
+It *will not* work correctly with S3, because it relies on an atomic rename-no-overwrite
+operation to commit the manifest file. It will also have the performance
+problems of copying rather than moving all the generated data.
+
+Although it will work with MapReduce,
+there is no handling of multiple job attempts with recovery from previous failed
+attempts.
+
+### The Manifest
+
+A manifest file is defined which contains (along with IOStatistics and some
+other things):
+
+1. A list of destination directories which must be created if they do not exist.
+1. A list of files to rename, recorded as (absolute source, absolute destination,
+   file-size) entries.
+
+### Task Commit
+
+Task attempts are committed by:
+
+1. Recursively listing the task attempt working dir to build
+   1. A list of directories under which files are renamed.
+   2. A list of files to rename: source, destination, size and optionally, etag.
+2. Saving this information in a manifest file in the job attempt directory with
+   a filename derived from the Task ID.
+   Note: writing to a temp file and then renaming to the final path will be used
+   to ensure the manifest creation is atomic.
+
+
+No renaming takes place; the files are left in their original location.
+
+The directory treewalk is single-threaded, so it is `O(directories)`,
+with each directory listing using one or more paged LIST calls.
+
+This is simple, and for most tasks, the scan is off the critical path of the job.
+
+Statistics analysis may justify moving to parallel scans in future.
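+
+The following is a minimal sketch of that scan using the public Hadoop
+`FileSystem` API. The `Rename` entry type and the relative-path remapping are
+illustrative assumptions, not the committer's actual classes:
+
+```
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Hypothetical sketch of the task-commit scan; not the committer's actual code. */
+public final class TaskCommitScanSketch {
+
+  /** One planned rename: (absolute source, absolute destination, size). */
+  static final class Rename {
+    final Path source;
+    final Path dest;
+    final long size;
+
+    Rename(Path source, Path dest, long size) {
+      this.source = source;
+      this.dest = dest;
+      this.size = size;
+    }
+  }
+
+  /**
+   * Recursively list the task attempt working dir, recording destination
+   * directories and files to rename; nothing is renamed at this point.
+   */
+  static void scan(FileSystem fs, Path taskAttemptDir, Path current, Path destDir,
+      List<Path> dirsToCreate, List<Rename> renames) throws IOException {
+    for (FileStatus st : fs.listStatus(current)) {
+      // remap a path under the task attempt dir to the same relative path under destDir
+      String relative = st.getPath().toUri().getPath()
+          .substring(taskAttemptDir.toUri().getPath().length() + 1);
+      Path dest = new Path(destDir, relative);
+      if (st.isDirectory()) {
+        dirsToCreate.add(dest);
+        scan(fs, taskAttemptDir, st.getPath(), destDir, dirsToCreate, renames);
+      } else {
+        renames.add(new Rename(st.getPath(), dest, st.getLen()));
+      }
+    }
+  }
+
+  private TaskCommitScanSketch() {
+  }
+}
+```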
+
+
+### Job Commit
+
+Job Commit consists of:
+
+1. List all manifest files in the job attempt directory.
+1. Load each manifest file, create directories which do not yet exist, then
+   rename each file in the rename list.
+1. Save a JSON `_SUCCESS` file with the same format as the S3A committer (for
+   testing; use write and rename for atomic save)
+
+The job commit phase supports parallelization for many tasks and many files
+per task, specifically (a sketch follows this list):
+
+1. Manifest tasks are loaded and processed in a pool of "manifest processor"
+   threads.
+2. Directory creation and file rename operations are each processed in a pool
+   of "executor" threads: many renames can execute in parallel as they use minimal
+   network IO.
+3. job cleanup can parallelize deletion of task attempt directories. This
+   is relevant as directory deletion is `O(files)` on Google cloud storage,
+   and also on ABFS when OAuth authentication is used.
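+
+As an illustration of the rename parallelization, a minimal sketch using a plain
+`ExecutorService` rather than the committer's actual `TaskPool`/submitter classes
+(the `renameAll` helper and the `Path[]` pairs are assumptions for this example):
+
+```
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Hypothetical sketch of parallel renames in job commit. */
+public final class ParallelRenameSketch {
+
+  /** Rename every {source, dest} pair in a fixed-size thread pool, then wait. */
+  static void renameAll(FileSystem fs, List<Path[]> renames, int threads)
+      throws Exception {
+    ExecutorService pool = Executors.newFixedThreadPool(threads);
+    try {
+      List<Future<?>> futures = new ArrayList<>();
+      for (Path[] pair : renames) {
+        futures.add(pool.submit(() -> {
+          // each rename is a single metadata operation; failures surface via the Future
+          if (!fs.rename(pair[0], pair[1])) {
+            throw new IOException("Failed to rename " + pair[0] + " to " + pair[1]);
+          }
+          return null;
+        }));
+      }
+      for (Future<?> f : futures) {
+        f.get();   // rethrows any rename failure wrapped in an ExecutionException
+      }
+    } finally {
+      pool.shutdown();
+    }
+  }
+
+  private ParallelRenameSketch() {
+  }
+}
+```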
+
+
+### Ancestor directory preparation
+
+Optional scan of all ancestor paths; if any are files, delete them.
+
+
+### Parent directory creation
+
+1. Probe the shared directory map for the directory existing. If found: the operation is
+   complete.
+1. If there is no entry in the map, call `getFileStatus()` on the path.
+   * Not found: create the directory, then add entries for it and all parent paths.
+   * Found and is a directory: add entries for it and all parent paths.
+   * Found and is a file: delete it, then create the directory as before.
+
+Efficiently handling concurrent creation of directories (or delete+create) is going to be a
+trouble spot; some effort is invested there to build the set of directories to
+create.
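+
+One possible shape for this, assuming a shared `ConcurrentHashMap` as the
+"known directories" cache (the class and method names here are illustrative,
+not the committer's actual data structures):
+
+```
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Hypothetical sketch of idempotent, concurrency-safe directory preparation. */
+public final class DirPreparationSketch {
+
+  /** Shared across worker threads: paths known to exist as directories. */
+  private final ConcurrentMap<Path, Boolean> knownDirs = new ConcurrentHashMap<>();
+
+  private final FileSystem fs;
+
+  public DirPreparationSketch(FileSystem fs) {
+    this.fs = fs;
+  }
+
+  /** Ensure a destination directory exists, deleting any file found in its place. */
+  public void prepareDir(Path dir) throws IOException {
+    if (knownDirs.containsKey(dir)) {
+      return;                      // another thread already handled this path
+    }
+    FileStatus st;
+    try {
+      st = fs.getFileStatus(dir);
+    } catch (FileNotFoundException e) {
+      st = null;                   // nothing there yet
+    }
+    if (st != null && st.isFile()) {
+      fs.delete(dir, false);       // a file is in the way: remove it
+      st = null;
+    }
+    if (st == null) {
+      fs.mkdirs(dir);              // mkdirs is idempotent under concurrent callers
+    }
+    // record this path and its ancestors so later probes are map lookups only
+    for (Path p = dir; p != null; p = p.getParent()) {
+      knownDirs.putIfAbsent(p, Boolean.TRUE);
+    }
+  }
+}
+```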
+
+### File Rename
+
+Files are renamed in parallel.
+
+A pre-rename check for anything existing at the destination path (deleting it if found) will be optional.
+As Spark creates new UUIDs for each filename, such collisions are not expected, and
+omitting the check saves HTTP requests.
+
+
+### Validation
+
+Optional scan of all committed files to verify their length and, if known, their
+etag. For testing and diagnostics.
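+
+A minimal sketch of such a check, assuming the manifest entry carries the
+expected file length (the helper below is illustrative, not the committer's
+actual validation code):
+
+```
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Hypothetical sketch of post-commit output validation. */
+public final class OutputValidationSketch {
+
+  /** Verify a committed file exists at its destination with the expected length. */
+  static void verify(FileSystem fs, Path dest, long expectedLength) throws IOException {
+    // getFileStatus raises FileNotFoundException if the file is missing
+    FileStatus st = fs.getFileStatus(dest);
+    if (st.getLen() != expectedLength) {
+      throw new IOException("Length mismatch for " + dest
+          + ": expected " + expectedLength + " but found " + st.getLen());
+    }
+    // etag comparison would additionally need the store to expose etags
+  }
+
+  private OutputValidationSketch() {
+  }
+}
+```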
+
+## Benefits
+
+* Pushes the source tree list operations into the task commit phase, which is
+  generally off the critical path of execution
+* Provides an atomic task commit to GCS, as there is no expectation that
+  directory rename is atomic
+* It is possible to pass IOStatistics from workers in the manifests.
+* Allows for some pre-rename operations similar to the S3A "Partitioned Staging
+  committer". This can be configured to delete all existing entries in
+  directories scheduled to be created, or fail if those partitions are
+  non-empty.
+  See [Partitioned Staging Committer](../../hadoop-aws/tools/hadoop-aws/committers.html#The_.E2.80.9CPartitioned.E2.80.9D_Staging_Committer)
+* Allows for an optional preflight validation check (verify no duplicate files created by different tasks)
+* Manifests can be viewed, size of output determined, etc, during
+  development/debugging.
+
+### Disadvantages
+
+* Needs a new manifest file format.
+* May make task commit more complex.
+
+This solution is necessary for GCS and should be beneficial on ABFS as listing
+overheads are paid for in the task committers.
+
+# Implementation Details
+
+### Constraints
+
+A key goal is to keep the manifest committer isolated and neither
+touch the existing committer code nor other parts of the hadoop codebase.
+
+It must plug directly into MR and Spark without needing any changes
+other than those already implemented for the S3A Committers.
+
+* Self-contained: MUST NOT require changes to hadoop-common, etc.
+* Isolated: MUST NOT make changes to existing committers
+* Integrated: MUST bind via `PathOutputCommitterFactory`.
+
+As a result of this there's a bit of copy and paste from elsewhere,
+e.g. `org.apache.hadoop.util.functional.TaskPool`
+is based on S3ACommitter's `org.apache.hadoop.fs.s3a.commit.Tasks`.
+
+The `_SUCCESS` file MUST be compatible with the S3A JSON file.
+This is to ensure any existing test suites which validate
+S3A committer output can be retargeted at jobs executed
+by the manifest committer without any changes.
+
+
+#### Progress callbacks in job commit.
+
+When? Proposed: heartbeat until renaming finally finishes.
+
+#### Error handling and aborting in job commit.
+
+We would want to stop the entire job commit. Some atomic boolean "abort job"
+would need to be checked in the processing of each task committer thread's
+iteration through a directory (or processing of each file?).
+Failures in listing or renaming will need to be escalated to halting the entire
+job commit. This implies that any IOE raised in an asynchronous rename operation or
+in a task committer thread must (as sketched after this list):
+
+1. be caught
+1. be stored in a shared field/variable
+1. trigger the abort
+1. be rethrown at the end of the `commitJob()` call
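+
+A minimal sketch of that pattern, using an `AtomicBoolean` abort flag and an
+`AtomicReference` holding the first failure (the class and interface here are
+illustrative assumptions, not the committer's actual code):
+
+```
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicReference;
+
+/** Hypothetical sketch of failure propagation across job-commit worker threads. */
+public final class CommitAbortSketch {
+
+  private final AtomicBoolean abortJob = new AtomicBoolean(false);
+  private final AtomicReference<IOException> firstFailure = new AtomicReference<>();
+
+  /** Wraps each list/rename operation executed by a worker thread. */
+  public void runOperation(IORunnable operation) {
+    if (abortJob.get()) {
+      return;                                // another worker failed: stop quickly
+    }
+    try {
+      operation.run();
+    } catch (IOException e) {
+      firstFailure.compareAndSet(null, e);   // remember only the first failure
+      abortJob.set(true);                    // tell the other workers to stop
+    }
+  }
+
+  /** Called at the end of commitJob() once all workers have finished. */
+  public void maybeRethrow() throws IOException {
+    IOException e = firstFailure.get();
+    if (e != null) {
+      throw e;
+    }
+  }
+
+  /** Minimal functional interface for an operation which may raise an IOException. */
+  @FunctionalInterface
+  public interface IORunnable {
+    void run() throws IOException;
+  }
+}
+```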
+
+#### Avoiding deadlocks
+
+If a job commit stage is using a thread pool for per-task operations, e.g. loading
+files, that same thread pool MUST NOT be used for parallel operations within
+the per-task stage.
+
+As every `JobStage` is executed in sequence within task or job commit, it
+is safe to share the same thread pool across stages.
+
+In the current implementation, there is no parallel "per manifest" operation
+in job commit other than for actually loading the files.
+The operations to create directories and to rename files are
+performed without parallel processing of individual manifests.
+
+Directory Preparation: merge the directory lists of all manifests,
+then queue for creation the (hopefully very much smaller) set of unique
+directories.
+
+Rename: iterate through all manifests and queue their renames into a pool for
+renaming.
+
+#### Thread pool lifetimes
+
+The lifespan of thread pools is constrained to that of the stage configuration,
+which will be limited to within each of the `PathOutputCommitter` methods
+to setup, commit, abort and cleanup.
+
+This avoids the thread pool lifecycle problems of the S3A Committers.
+
+#### Scale issues similar to S3A HADOOP-16570.
+
+This was a failure in terasorting where many tasks each generated many files;
+the full list of files to commit (and the etag of every block) was built up in memory and
+validated prior to execution.
+
+The manifest committer assumes that the amount of data being stored in memory is less,
+because there is no longer the need to store an etag for every block of every
+file being committed.
+
+
+#### Duplicate creation of directories in the dest dir
+
+Combine all lists of directories to create and eliminate duplicates.
+
+## Implementation Architecture
+
+The implementation architecture reflects lessons from the S3A Connector.
+
+* Isolate the commit stages from the MR commit class, as that's got a complex lifecycle.
+* Instead, break up into series of _stages_ which can be tested in isolation
+  and chained to provide the final protocol.
+* Don't pass MR data types (taskID etc) down into the stages; pass down a configuration
+  with general types (string etc).
+* Also pass in a callback for store operations, for ease of implementing a fake store.
+* For each stage: define preconditions and postconditions, failure modes. Test in isolation.
+
+#### Statistics
+
+The committer collects duration statistics on all the operations it performs/invokes
+against filesystems.
+* Those collected during task commit are saved to the manifest (excluding the time to
+  save and rename that file).
+* When these manifests are loaded during job commit, these statistics are merged to
+  form aggregate statistics of the whole job.
+* These aggregate statistics are saved to the `_SUCCESS` file,
+  and to any copy of that file in the directory specified by
+  `mapreduce.manifest.committer.summary.report.directory`, if set.
+* The class `org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestPrinter`
+  can load and print these.
+
+IO statistics from filesystems and input and output streams used in a query are not
+collected.
+
+
+## Auditing
+
+When invoking the `ManifestCommitter` via the `PathOutputCommitter` API, the following
+attributes are added to the active (thread) context:
+
+| Key   | Value           |
+|-------|-----------------|
+| `ji`  | Job ID          |
+| `tai` | Task Attempt ID |
+| `st`  | Stage           |
+
+These are also all set in all the helper threads performing work
+as part of a stage's execution.
+
+Any store/FS which supports auditing is able to collect this data
+and include it in its logs.
+
+To ease backporting, all audit integration is in the single class
+`org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration`.

+ 617 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_protocol.md

@@ -0,0 +1,617 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+
+# Manifest Committer Protocol
+
+This document describes the commit protocol
+ of the [Manifest Committer](manifest_committer.html)
+
+<!-- MACRO{toc|fromDepth=0|toDepth=2} -->
+
+## Background
+
+### Terminology
+
+| Term | Meaning|
+|------|--------|
+| Committer |  A class which can be invoked by MR or Spark to perform the task and job commit operations. |
+| Spark Driver | The spark process scheduling the work and choreographing the commit operation.|
+| Job | In MapReduce, the entire application. In Spark, this is a single stage in a chain of work |
+| Job Attempt | A single attempt at a job. MR supports multiple Job attempts with recovery on partial job failure. Spark says "start again from scratch" |
+| Task | a subsection of a job, such as processing one file, or one part of a file |
+| Task ID |  ID of the task, unique within this job. Usually starts at 0 and is used in filenames (part-0000, part-001, etc.) |
+| Task attempt (TA) | An attempt to perform a task. It may fail, in which case MR/Spark will schedule another. |
+| Task Attempt ID | A unique ID for the task attempt. The Task ID + an attempt counter.|
+|  Destination directory | The final destination of work.|
+| Job Attempt Directory | A temporary directory used by the job attempt. This is always _underneath_ the destination directory, so as to ensure it is in the same encryption zone as HDFS, storage volume in other filesystems, etc.|
+| Task Attempt directory | Directory under the Job Attempt Directory where task attempts create subdirectories for their own work |
+| Task Attempt Working Directory| Directory exclusive for each task attempt under which files are written  |
+| Task Commit | Taking the output of a Task Attempt and making it the final/exclusive result of that "successful" Task.  |
+| Job Commit | aggregating all the outputs of all committed tasks and producing the final results of the job. |
+
+
+
+The purpose of a committer is to ensure that the complete output of
+a job ends up in the destination, even in the presence of failures of tasks.
+
+- _Complete:_ the output includes the work of all successful tasks.
+- _Exclusive:_ the output of unsuccessful tasks is not present.
+- _Concurrent:_ When multiple tasks are committed in parallel the output is the same as when
+  the task commits are serialized. This is not a requirement of Job Commit.
+- _Abortable:_  jobs and tasks may be aborted prior to job commit, after which their output is not visible.
+- _Continuity of correctness:_ once a job is committed, the output of any failed,
+  aborted, or unsuccessful task MUST NOT appear at any point in the future.
+
+For Hive's classic hierarchical-directory-structured tables, job committing
+requires the output of all committed tasks to be put into the correct location
+in the directory tree.
+
+The committer built into `hadoop-mapreduce-client-core` module is the `FileOutputCommitter`.
+
+
+It has two algorithms, v1 and v2.
+
+The v1 algorithm is resilient to all forms of task failure, but slow
+when committing the final aggregate output as it renames each newly created file
+to the correct place in the table one by one.
+
+The v2 algorithm is not considered safe because the output is visible when individual
+tasks commit, rather than being delayed until job commit.
+It is possible for multiple task attempts to get their data into the output
+directory tree, and if a job fails/is aborted before the job is committed,
+the output is visible.
+
+## File Output Committer V1 and V2
+
+### File Output Committer V1 and V2 Commit algorithms
+
+#### Task attempt execution (V1 and V2)
+
+The job attempt directory in `$dest/__temporary/$jobAttemptId/` contains all output
+of the job in progress. Every task attempt is allocated its own task attempt dir
+`$dest/__temporary/$jobAttemptId/__temporary/$taskAttemptId`.
+
+All work for a task is written under the task attempt directory. If the output
+is a deep tree with files at the root, the task attempt dir will end up with a
+similar structure, with the files it has generated and the directories above
+them.
+
+### MapReduce V1 algorithm:
+
+#### v1 Task commit
+
+The task attempt dir is renamed directly underneath the job attempt dir
+
+```
+rename(
+  $dest/__temporary/$jobAttemptId/__temporary/$taskAttemptId
+  $dest/__temporary/$jobAttemptId/$taskId)
+```
+
+#### V1 Job Commit
+
+For each committed task, all files underneath are renamed into the destination
+directory, with each path relative to the base directory of the task remapped
+to the equivalent path under the dest dir.
+
+That is, everything under `$dest/__temporary/$jobAttemptId/$taskId` is converted
+to a path under `$dest`.
+
+A recursive treewalk identifies the paths to rename in each TA directory.
+There's some optimisation if the task directory tree contains a subdirectory
+which does not exist under the destination: in this case the whole
+directory can be renamed. If the directory already exists, a file-by-file merge
+takes place for that dir, with the action for subdirectories again depending on
+the presence of the destination.
+
+As a result, if the output of each task goes to a separate final directory (e.g. the
+final partition is unique to a single task), the rename is `O(1)` for the dir,
+irrespective of children. If the output is to be in the same dir as other
+tasks (or updating existing directories), then the rename performance becomes
+`O(files)`.
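+
+A simplified illustration of that merge recursion (the real logic lives in
+`FileOutputCommitter`; this sketch is not that code):
+
+```
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Simplified illustration of the V1 merge-rename recursion. */
+public final class V1MergeSketch {
+
+  static void merge(FileSystem fs, FileStatus src, Path dest) throws IOException {
+    if (src.isFile()) {
+      // files are always moved one by one, replacing anything already there
+      fs.delete(dest, true);
+      fs.rename(src.getPath(), dest);
+    } else if (!fs.exists(dest)) {
+      // the whole directory can be renamed in one operation: O(1) on HDFS
+      fs.rename(src.getPath(), dest);
+    } else {
+      // destination directory exists: merge child by child, hence O(files)
+      for (FileStatus child : fs.listStatus(src.getPath())) {
+        merge(fs, child, new Path(dest, child.getPath().getName()));
+      }
+    }
+  }
+
+  private V1MergeSketch() {
+  }
+}
+```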
+
+Finally, a 0-byte `_SUCCESS` file is written iff `mapreduce.fileoutputcommitter.marksuccessfuljobs` is true.
+
+### MapReduce V2 algorithm:
+
+#### V2 Task commit
+
+The files under the task attempt dir are renamed one by one into the destination
+directory. There's no attempt at optimising directory renaming, because other
+tasks may be committing their work at the same time. It is therefore `O(files)` +
+the cost of listing the directory tree. Again: done with a recursive treewalk,
+not a deep `listFiles(path, recursive=true)` API, which would be faster on HDFS
+and (though not relevant here) S3.
+
+#### V2 Job Commit
+
+A 0-byte `_SUCCESS` file is written iff `mapreduce.fileoutputcommitter.marksuccessfuljobs`
+is true.
+
+
+
+### Why the V2 committer is incorrect/unsafe
+
+If, for a Task T1, Task Attempt 1 (T1A1) fails before committing, the driver
+will schedule a new attempt "T1A2", and commit it. All is good.
+
+But: if T1A1 was given permission to commit and it failed during the commit
+process, some of its output may have been written to the destination directory.
+
+If attempt T1A2 was then told to commit, then if and only if its output had the
+exact set of file names would any already-renamed files be overwritten. If
+different filenames were generated, then the output would contain files of T1A1
+and T1A2.
+
+If T1A1 became partitioned during the commit process, then the job committer
+would schedule another attempt and commit its work. However, if T1A1 still had
+connectivity to the filesystem, it could still be renaming files. The output of
+the two tasks could be intermingled even if the same filenames were used.
+
+## Background: the S3A Committers
+
+The paper, [_A Zero-Rename Committer_](https://github.com/steveloughran/zero-rename-committer/releases/),
+Loughran et al., covers these committers.
+
+It also describes the commit problem, defines correctness, and describes the
+algorithms of the v1 and v2 committers, as well as those of the S3A committers,
+IBM Stocator committer and what we know of EMR's Spark committer.
+
+The `hadoop-aws` JAR contains a pair of committers, "Staging" and "Magic". Both
+of these are implementations of the same problem: safely and rapidly committing
+work to an S3 object store.
+
+The committers take advantage of the fact that S3 offers an atomic way to create
+a file: the PUT request.
+
+Files either exist or they don't. A file can be uploaded direct to its
+destination, and it is only when the upload completes that the file is manifested,
+overwriting any existing copy.
+
+For large files, a multipart upload allows this upload operation to be split
+into a series of POST requests:
+
+1. `initiate-upload (path -> upload ID)`
+1. `upload part(path, upload ID, data[]) -> checksum.`
+   This can be parallelised. Up to 10,000 parts can be uploaded to a single
+   object. All but the final part must be >= 5MB.
+1. `complete-upload (path, upload ID, List<checksum>)`
+   this manifests the file, building it from the parts in the sequence of blocks
+   defined by the ordering of the checksums.
+
+The secret for the S3A committers is that the final POST request can be delayed
+until the job commit phase, even though the files are uploaded during task
+attempt execution/commit. The task attempts need to determine the final
+destination of each file, upload the data as part of a multipart operation, then
+save the information needed to complete the upload in a file which is later read
+by the job committer and used in a POST request.
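+
+To make the split explicit, here is a hypothetical client interface for those
+three operations; it is not the real S3 SDK or S3A API, only an illustration of
+which call is deferred to job commit:
+
+```
+import java.util.List;
+
+/** Hypothetical multipart-upload client; only completeUpload() makes the object visible. */
+public interface MultipartUploadSketch {
+
+  /** Task attempt: start an upload targeted at the final destination path. */
+  String initiateUpload(String destinationPath);
+
+  /** Task attempt: upload one part; may be called in parallel. Returns the part checksum. */
+  String uploadPart(String destinationPath, String uploadId, byte[] data);
+
+  /** Job commit: the only call which manifests the object at its destination. */
+  void completeUpload(String destinationPath, String uploadId, List<String> partChecksums);
+}
+```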
+
+### Staging Committer
+
+The _Staging Committer_ is based on the contribution by Ryan Blue of Netflix.
+It relies on HDFS to be the consistent store to propagate the `.pendingset` files.
+
+The working directory of each task attempt is in the local filesystem, "the
+staging directory". The information needed to complete the uploads is passed
+from Task Attempts to the Job Committer by using a v1 FileOutputCommitter
+working with the cluster HDFS filesystem. This ensures that the committer has
+the same correctness guarantees as the v1 algorithm.
+
+1. Task commit consists of uploading all files under the local filesystem's task
+   attempt working directory to their final destination path, holding back on
+   the final manifestation POST.
+1. A JSON file containing all information needed to complete the upload of all
+   files in the task attempt is written to the Job Attempt directory of the
+   wrapped committer working with HDFS.
+1. Job commit: load in all the manifest files in the HDFS job attempt directory,
+   then issue the POST requests to complete the uploads. These are parallelised.
+
+
+### The Magic Committer
+
+The _Magic Committer_ is purely-S3A and takes advantage of
+the fact that the authors could make changes within the filesystem client itself.
+
+"Magic" paths are defined which, when opened for writing under, initiate a
+multi-party upload to the final destination directory. When the output stream is
+`close()`d, a zero byte marker file is written to the magic path, and a JSON
+.pending file containing all the information needed to complete the upload is
+saved.
+
+Task commit:
+
+1. List all `.pending` files under each task attempt's magic directory;
+1. Aggregate to a `.pendingset` file
+1. Save to the job attempt directory with the task ID.
+
+Job commit:
+
+1. List `.pendingset` files in the job attempt directory
+1. Complete the uploads with POST requests.
+
+The Magic committer absolutely requires a consistent S3 store, originally provided by
+S3Guard. Now that S3 is consistent, raw S3 can be used. It does not need HDFS
+or any other filesystem with `rename()`.
+
+### Correctness
+
+The S3A committer is considered correct because
+
+1. Nothing is materialized until job commit.
+1. Only one task attempt's manifest can be saved to the job attempt directory.
+   Hence: only one TA's files for a given task ID are exclusively committed.
+1. The staging committer's use of HDFS to pass manifests from TAs to the Job
+   committer ensures that S3's eventual consistency would not cause manifests to
+   be missed.
+1. Until S3 was consistent, the magic committer relied on S3Guard to provide the
+   list consistency needed during both task- and job- commit.
+1. The authors and wider community fixed all the issues related to the committers
+   which have surfaced in production.
+
+Significant issues which were fixed include:
+
+* [HADOOP-15961](https://issues.apache.org/jira/browse/HADOOP-15961).
+  S3A committers: make sure there's regular progress() calls.
+* [HADOOP-16570](https://issues.apache.org/jira/browse/HADOOP-16570).
+  S3A committers encounter scale issues.
+* [HADOOP-16798](https://issues.apache.org/jira/browse/HADOOP-16798).
+  S3A Committer thread pool shutdown problems.
+* [HADOOP-17112](https://issues.apache.org/jira/browse/HADOOP-17112).
+  S3A committers can't handle whitespace in paths.
+* [HADOOP-17318](https://issues.apache.org/jira/browse/HADOOP-17318).
+  Support concurrent S3A commit jobs with same app attempt ID.
+* [HADOOP-17258](https://issues.apache.org/jira/browse/HADOOP-17258).
+  MagicS3GuardCommitter fails with `pendingset` already exists
+* [HADOOP-17414](https://issues.apache.org/jira/browse/HADOOP-17414).
+  Magic committer files don't have the count of bytes written collected by spark
+* [SPARK-33230](https://issues.apache.org/jira/browse/SPARK-33230)
+  Hadoop committers to get unique job ID in `spark.sql.sources.writeJobUUID`
+* [SPARK-33402](https://issues.apache.org/jira/browse/SPARK-33402)
+  Jobs launched in same second have duplicate MapReduce JobIDs
+* [SPARK-33739](https://issues.apache.org/jira/browse/SPARK-33739).
+  Jobs committed through the S3A Magic committer don't report
+  the bytes written (depends on HADOOP-17414)
+
+Of those which affected the correctness rather than scale/performance/UX:
+HADOOP-17258 involved the recovery from a failure after TA1 task commit had
+completed, but had failed to report in. SPARK-33402, SPARK-33230 and
+HADOOP-17318 are all related: if two spark jobs/stages started in the
+same second, they had the same job ID. This caused the HDFS directories used by
+the staging committers to be intermingled.
+
+What is notable is this: these are all problems which the minimal integration
+test suites did not discover.
+
+The good news: we now know of these issues and are better placed to avoid
+replicating them again. And know what to write tests for.
+
+## The V1 committer: slow in Azure and slow and unsafe on GCS.
+
+The V1 committer underperforms on ABFS because:
+
+1. Directory listing and file renaming is somewhat slower with ABFS than it is
+   with HDFS.
+1. The v1 committer sequentially commits the output of each task through a
+   listing of each committed task's output, moving directories when none exist
+   in the destination, merging files into extant directories.
+
+The V2 committer is much faster in the job commit because it performs the list
+and rename process in the task commit, which, because it is non-atomic, is why
+it is considered dangerous to use. What the V2 task commit algorithm does show is
+that it is possible to parallelise committing the output of different tasks by
+using file-by-file rename exclusively.
+
+The V1 committer underperforms on GCS because even the task commit operation,
+directory rename, is a non-atomic `O(files)` operation.
+This also means that it is unsafe.
+
+If the task attempt has become partitioned and the Spark driver schedules/commits another TA, then
+the task dir may contain one or more files from the first attempt.
+
+----------------------------------------------------------------------------------------
+
+# The Manifest Committer Protocol
+
+## Requirements of the Store
+
+Stores/filesystems supported by this committer MUST:
+
+* Have consistent listings.
+* Have an atomic `O(1)` file rename operation.
+
+Stores/filesystems supported by this committer SHOULD:
+
+* Rename files successfully, even under load. ABFS does not do this,
+  so special recovery is provided there.
+* Implement the `EtagSource` interface of HADOOP-17979.
+  This is used for ABFS rename recovery, and for optional
+  validation of the final output.
+
+Stores/filesystems supported by this committer MAY:
+
+* Have list operations with high latency.
+* Reject calls under load with throttling responses,
+  which MUST be handled in the filesystem connector.
+
+Stores/filesystems supported by this committer MAY NOT:
+
+* Support atomic directory rename. This is never used except optionally in cleanup.
+* Support `O(1)` directory deletion. The `CleanupJobStage` assumes this is not
+  the case and so deletes task attempt directories in parallel.
+* Support an atomic `create(Path, overwrite=false)` operation.
+  The manifests are committed by writing to a path including the task attempt ID,
+  then renamed to their final path.
+* Support fast `listFiles(path, recursive=true)` calls.
+  This API call is not used.
+
+When compared with the `FileOutputCommitter`, the requirements
+which have been removed are:
+
+* Atomic directory rename.
+* `O(1)` directory deletion.
+* Fast directory listings.
+* The implicit absence of throttling behaviors.
+
+HDFS meets all those requirements, so does not benefit significantly from
+this committer, though it will still work there.
+
+The S3 store does not meet the rename requirements of this committer,
+even now that it is consistent.
+This committer is not safe to use on S3.
+
+### Task and Job IDs
+
+Every job MUST have a unique ID.
+
+The implementation expects the Spark runtime to have the relevant patches to
+ensure this.
+
+The job ID is used to name temporary directories, rather than using the classic
+incrementing natural numbering scheme of `_temporary/0/`.
+That scheme comes from MapReduce where job attempts of attempt ID &gt; 1
+look for tasks committed by predecessors and incorporate that into their
+results.
+
+This committer targets Spark, where there is no attempt at recovery.
+By using the job ID in paths, if jobs are configured to _not_ delete
+all of `_temporary` in job cleanup/abort, then multiple jobs
+MAY be executed using the same table as their destination.
+
+Task IDs and Task Attempt IDs will be derived from Job IDs as usual.
+
+It is expected that filenames of written files SHALL be unique.
+This is done in Spark for ORC and Parquet files, and allows for
+checks for destination files to be omitted by default.
+
+
+## Directory Structure
+
+Given a destination directory `destDir: Path`
+
+A job of id `jobID: String` and attempt number `jobAttemptNumber:int`
+will use the directory:
+
+```
+$destDir/_temporary/manifest_$jobID/$jobAttemptNumber/
+```
+
+for its work (note: it will actually format that final subdir with `%02d`).
+
+This is termed the _Job Attempt Directory_.
+
+Under the Job Attempt Directory, a subdirectory `tasks` is
+created. This is termed the _Task Attempt Directory_.
+Every task attempt will have its own subdirectory of this,
+into which its work will be saved.
+
+Under the Job Attempt Directory, a subdirectory `manifests` is created.
+This is termed the _Manifest Directory_.
+
+The manifests of all committed tasks will be saved to this
+directory with the filename of
+`$taskId-manifest.json`
+
+The full path
+
+```
+$destDir/_temporary/manifest_$jobID/$jobAttemptNumber/manifests/$taskId-manifest.json
+```
+
+is the final location for the manifest of all files created by
+a committed task. It is termed the _Manifest Path of a Committed Task_.
+
+Task attempts will save their manifest into this directory with
+a temporary filename
+`$taskAttemptId-manifest.json.tmp`.
+
+This is termed the _Temporary Path of a Task Attempt's Manifest_.
+
+For the job and task operations then, the following paths are
+defined.
+```
+let jobDirectory = "$destDir/_temporary/manifest_$jobID/"
+let jobAttemptDirectory = jobDirectory + "$jobAttemptNumber/"
+let manifestDirectory = jobAttemptDirectory + "manifests/"
+let taskAttemptDirectory = jobAttemptDirectory + "tasks/"
+```
+
+And for each task attempt, the following paths are also defined
+
+```
+let taskAttemptWorkingDirectory = taskAttemptDirectory + "$taskAttemptId"
+let taskManifestPath = manifestDirectory + "$taskId-manifest.json"
+let taskAttemptTemporaryManifestPath = manifestDirectory + "$taskAttemptId-manifest.json.tmp"
+```
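+
+As a concrete illustration, those paths could be built with Hadoop `Path`
+arithmetic as below, including the `%02d` formatting of the job attempt number
+noted earlier (illustrative only, not the committer's own helper methods):
+
+```
+import org.apache.hadoop.fs.Path;
+
+/** Illustrative construction of the committer's working paths. */
+public final class ManifestPathsSketch {
+
+  static Path jobAttemptDirectory(Path destDir, String jobId, int jobAttemptNumber) {
+    Path jobDirectory = new Path(destDir, "_temporary/manifest_" + jobId);
+    // the attempt number is zero-padded, e.g. "00", "01"
+    return new Path(jobDirectory, String.format("%02d", jobAttemptNumber));
+  }
+
+  static Path manifestDirectory(Path jobAttemptDir) {
+    return new Path(jobAttemptDir, "manifests");
+  }
+
+  static Path taskAttemptWorkingDirectory(Path jobAttemptDir, String taskAttemptId) {
+    return new Path(new Path(jobAttemptDir, "tasks"), taskAttemptId);
+  }
+
+  static Path taskManifestPath(Path manifestDir, String taskId) {
+    return new Path(manifestDir, taskId + "-manifest.json");
+  }
+
+  private ManifestPathsSketch() {
+  }
+}
+```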
+
+## Core Algorithm of the Protocol
+
+1. Each Task attempt writes all its files to a unique directory tree under the
+   Task Attempt Directory.
+2. Task Commit consists of a recursive scan of the directory for that task attempt,
+   creating a list of directories and a list of files.
+3. These lists are saved as a JSON manifest file.
+4. Job commit consists of listing all of the JSON manifest files,
+   loading their contents, creating the aggregate set of destination directories
+   and renaming all files into their final destinations.
+
+
+### The Intermediate Manifest
+
+This is a JSON file which contains (along with IOStatistics and some diagnostics) the following, sketched below:
+
+1. A list of destination directories which must be created if they do not exist.
+1. A list of files to rename as (absolute source, absolute destination,
+   file-size) entries.
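+
+A minimal model of that logical structure; the field names below are
+hypothetical, the actual schema being defined by the `TaskManifest` class:
+
+```
+import java.util.List;
+
+/** Hypothetical model of an intermediate manifest's logical content. */
+public class IntermediateManifestSketch {
+
+  /** Task attempt which produced this manifest. */
+  public String taskAttemptId;
+
+  /** Destination directories to create during job commit if they do not already exist. */
+  public List<String> directoriesToCreate;
+
+  /** Files to rename during job commit. */
+  public List<Entry> filesToRename;
+
+  /** One rename: absolute source, absolute destination, size and optional etag. */
+  public static class Entry {
+    public String sourcePath;
+    public String destinationPath;
+    public long size;
+    public String etag;    // null if the store does not expose etags
+  }
+}
+```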
+
+### Job Setup
+
+```
+mkdir(jobAttemptDirectory)
+mkdir(manifestDirectory)
+mkdir(taskAttemptDirectory)
+```
+
+### Task Setup
+
+```
+mkdir(taskAttemptWorkingDirectory)
+```
+
+### Task Commit
+
+Task attempts are committed by:
+
+1. Recursively listing the task attempt working dir to build
+   1. A list of destination directories under which files will be renamed,
+      and their status (exists, not_found, file)
+   2. A list of files to rename: source, destination, size and optionally, etag.
+2. These lists populate a JSON file, the _Intermediate Manifest_.
+3. The task attempt saves this file to its _Temporary Path of a Task Attempt's
+   Manifest_.
+4. The task attempt then deletes the _Manifest Path of a Committed Task_ and
+   renames its own manifest file to that path.
+5. If the rename succeeds, the task commit is considered a success.
+
+No renaming takes place at this point: the files are left in their original location until
+renamed in job commit.
+
+```
+let (renames, directories) = scan(taskAttemptWorkingDirectory)
+let manifest = new Manifest(renames, directories)
+
+manifest.save(taskAttemptTemporaryManifestPath)
+rename(taskAttemptTemporaryManifestPath, taskManifestPath)
+```
+
+### Task Abort/cleanup
+
+```
+delete(taskAttemptWorkingDirectory)
+```
+
+### Job Commit
+
+Job Commit consists of:
+
+1. List all manifest files in the job attempt directory.
+2. Load each manifest file, create directories which do not yet exist, then
+   rename each file in the rename list.
+3. Optionally save a JSON `_SUCCESS` file with the same format as the S3A committer (for
+   testing; use write and rename for atomic save)
+
+The job commit phase supports parallelization for many tasks and many files
+per task; specifically, there is a thread pool for parallel store IO, used for:
+
+1. Manifest tasks are loaded and processed in parallel.
+1. Deletion of files where directories are intended to be created.
+1. Creation of leaf directories.
+1. File rename.
+1. In cleanup and abort: deletion of task attempt directories
+1. If validation of output is enabled for testing/debugging: getFileStatus calls
+   to compare file length and, if possible, etags.
+
+```
+let manifestPaths = list("$manifestDirectory/*-manifest.json")
+let manifests = manifestPaths.map(p -> loadManifest(p))
+let directoriesToCreate = merge(manifests.directories)
+let filesToRename = concat(manifests.files)
+
+directoriesToCreate.map(p -> mkdirs(p))
+filesToRename.map((src, dest, etag) -> rename(src, dest, etag))
+
+if mapreduce.fileoutputcommitter.marksuccessfuljobs then
+  success.save("$destDir/_SUCCESS")
+
+```
+
+Implementation Note:
+
+To aid debugging and development, the summary may be saved to a location
+in the same _or different_ filesystem; the intermediate
+manifests may be renamed to a location in the target filesystem.
+
+```
+if summary.report.directory != "" then
+  success.save("${summary.report.directory}/$jobID.json")
+if diagnostics.manifest.directory != null then
+  rename($manifestDirectory, "${diagnostics.manifest.directory}/$jobID")
+```
+
+The summary report is saved even if job commit fails for any reason.
+
+### Job Abort/cleanup
+
+Job cleanup nominally consists of deleting the job directory:
+```
+delete(jobDirectory)
+```
+
+To address scale issues with the object stores, this SHALL be preceded by
+a (parallelized) deletion of all task attempt working directories:
+
+```
+let taskAttemptWorkingDirectories = list("taskAttemptDirectory")
+taskAttemptWorkingDirectories.map(p -> delete(p))
+```
+
+
+## Benefits of the new protocol
+
+* Pushes the source tree list operations into the task commit phase, which is
+  generally off the critical path of execution.
+* Reduces the number of directories probed/created to the aggregate set of
+  output directories, with all duplicates eliminated.
+* File rename can be parallelized, with the limits being that of configured
+  thread pool sizes and/or any rate limiting constraints.
+* Provides an atomic task commit to GCS, as there is no expectation that
+  directory rename is atomic.
+* Permits passing IOStatistics from task attempts to the job committer via the manifests.
+* Allows for some pre-rename operations in the Job Committer
+  similar to the S3A "Partitioned Staging committer".
+  This can be configured to delete all existing entries in
+  directories scheduled to be created, or fail if those partitions are
+  non-empty.
+  See [Partitioned Staging Committer](../../hadoop-aws/tools/hadoop-aws/committers.html#The_.E2.80.9CPartitioned.E2.80.9D_Staging_Committer)
+* Allows for an optional preflight validation check (verify no duplicate files created by different tasks).
+* Manifests can be viewed, size of output determined, etc., during development/debugging.
+
+## Disadvantages of the new protocol compared to the v1 algorithm
+
+* Needs a new manifest file format.
+* Manifests may get large if tasks create many files and/or subdirectories, or if
+  etags are collected and the length of these tags is significant.
+  The HTTP protocol limits each etag to 8 KiB, so the cost may be 8 KiB per file.
+* Makes task commit more complex than the v1 algorithm.
+* Possibly suboptimal on jobs where individual tasks create unique output directories,
+  as directory rename will never be used to commit a directory.

+ 1085 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/AbstractManifestCommitterTest.java

@@ -0,0 +1,1085 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import org.assertj.core.api.Assertions;
+import org.junit.AfterClass;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.contract.AbstractFSContract;
+import org.apache.hadoop.fs.contract.AbstractFSContractTestBase;
+import org.apache.hadoop.fs.contract.ContractTestUtils;
+import org.apache.hadoop.fs.contract.localfs.LocalFSContract;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SaveTaskManifestStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.util.DurationInfo;
+import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.functional.CloseableTaskPoolSubmitter;
+import org.apache.hadoop.util.functional.RemoteIterators;
+import org.apache.hadoop.util.functional.TaskPool;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.readDataset;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics;
+import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics;
+import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.SUCCESSFUL_JOB_OUTPUT_DIR_MARKER;
+import static org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory.COMMITTER_FACTORY_CLASS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConfig.createCloseableTaskSubmitter;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_ID_SOURCE_MAPREDUCE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_FACTORY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_DIAGNOSTICS_MANIFEST_DIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_SUMMARY_REPORT_DIR;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_VALIDATE_OUTPUT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.getProjectBuildDir;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.validateSuccessFile;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.NAME_FORMAT_JOB_ATTEMPT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createIOStatisticsStore;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createTaskManifest;
+import static org.apache.hadoop.util.functional.FutureIO.awaitFuture;
+
+/**
+ * Tests which work with manifest committers.
+ * This is a filesystem contract bound to the local filesystem;
+ * subclasses may change the FS to test against other stores.
+ * Some fields are set up
+ * in {@link #executeOneTaskAttempt(int, int, int)},
+ * which is why fields are used.
+ * When synchronized access is needed, synchronize on (this) rather
+ * than individual fields.
+ */
+public abstract class AbstractManifestCommitterTest
+    extends AbstractFSContractTestBase {
+
+  protected static final Logger LOG =
+      LoggerFactory.getLogger(AbstractManifestCommitterTest.class);
+
+  /**
+   * Some Job and task IDs.
+   */
+  protected static final ManifestCommitterTestSupport.JobAndTaskIDsForTests
+      TASK_IDS = new ManifestCommitterTestSupport.JobAndTaskIDsForTests(2, 2);
+
+  public static final int JOB1 = 1;
+
+  public static final int TASK0 = 0;
+
+  public static final int TASK1 = 1;
+
+  /**
+   * Task attempt 0 index.
+   */
+  public static final int TA0 = 0;
+
+  /**
+   * Task attempt 1 index.
+   */
+  public static final int TA1 = 1;
+
+  /**
+   * Depth of dir tree to generate.
+   */
+  public static final int DEPTH = 3;
+
+  /**
+   * Width of dir tree at every level.
+   */
+  public static final int WIDTH = 2;
+
+  /**
+   * How many files to create in the leaf directories.
+   */
+  public static final int FILES_PER_DIRECTORY = 4;
+
+  /**
+   * Pool size.
+   */
+  public static final int POOL_SIZE = 32;
+
+  /**
+   * FileSystem statistics are collected across every test case.
+   */
+  protected static final IOStatisticsSnapshot FILESYSTEM_IOSTATS =
+      snapshotIOStatistics();
+
+  /**
+   * Counter for creating files. Ensures that across all test suites,
+   * duplicate filenames are never created. Helps assign blame.
+   */
+  private static final AtomicLong CREATE_FILE_COUNTER = new AtomicLong();
+
+  protected static final byte[] NO_DATA = new byte[0];
+
+  /**
+   * The thread leak tracker.
+   */
+  private static final ThreadLeakTracker THREAD_LEAK_TRACKER = new ThreadLeakTracker();
+
+  private static final int MAX_LEN = 64_000;
+
+  /**
+   * Submitter for tasks; may be null.
+   */
+  private CloseableTaskPoolSubmitter submitter;
+
+  /**
+   * Stage statistics. Created in test setup, and in
+   * teardown updates {@link #FILESYSTEM_IOSTATS}.
+   */
+  private IOStatisticsStore stageStatistics;
+
+  /**
+   * Prefer to use these to interact with the FS to
+   * ensure more implicit coverage.
+   */
+  private ManifestStoreOperations storeOperations;
+
+  /**
+   * Progress counter used in all stage configs.
+   */
+  private final ProgressCounter progressCounter = new ProgressCounter();
+
+  /**
+   * Directory for job summary reports.
+   * This should be set up in test suites testing against real object stores.
+   */
+  private File reportDir;
+
+  /**
+   * List of task attempt IDs for those tests which create manifests.
+   */
+  private final List<String> taskAttemptIds = new ArrayList<>();
+
+  /**
+   * List of task IDs for those tests which create manifests.
+   */
+  private final List<String> taskIds = new ArrayList<>();
+
+  /**
+   * any job stage configuration created for operations.
+   */
+  private StageConfig jobStageConfig;
+
+  /**
+   * Destination dir of job.
+   */
+  private Path destDir;
+
+  /**
+   * When creating manifests, total data size.
+   */
+  private final AtomicLong totalDataSize = new AtomicLong();
+
+  /**
+   * Where to move manifests; must be in target FS.
+   */
+  private Path manifestDir;
+
+  /**
+   * Get the contract configuration.
+   * @return the config used to create the FS.
+   */
+  protected Configuration getConfiguration() {
+    return getContract().getConf();
+  }
+
+  /**
+   * Store operations to interact with.
+   * @return store operations.
+   */
+  protected ManifestStoreOperations getStoreOperations() {
+    return storeOperations;
+  }
+
+  /**
+   * Set store operations.
+   * @param storeOperations new value
+   */
+  protected void setStoreOperations(final ManifestStoreOperations storeOperations) {
+    this.storeOperations = storeOperations;
+  }
+
+  public List<String> getTaskAttemptIds() {
+    return taskAttemptIds;
+  }
+
+  public List<String> getTaskIds() {
+    return taskIds;
+  }
+
+  public long getTotalDataSize() {
+    return totalDataSize.get();
+  }
+
+  public Path getManifestDir() {
+    return manifestDir;
+  }
+
+  /**
+   * Set builder value.
+   * @param value new value
+   * @return the builder
+   */
+  public AbstractManifestCommitterTest withManifestDir(Path value) {
+    manifestDir = value;
+    return this;
+  }
+
+  /**
+   * Describe a test in the logs.
+   * @param text text to print
+   * @param args arguments to format in the printing
+   */
+  protected void describe(String text, Object... args) {
+    LOG.info("\n\n{}: {}\n",
+        getMethodName(),
+        String.format(text, args));
+  }
+
+  /**
+   * Local FS unless overridden.
+   * @param conf configuration
+   * @return the FS contract.
+   */
+  @Override
+  protected AbstractFSContract createContract(final Configuration conf) {
+    return new LocalFSContract(conf);
+  }
+
+  /** Enable the manifest committer options in the configuration. */
+  @Override
+  protected Configuration createConfiguration() {
+    return enableManifestCommitter(super.createConfiguration());
+  }
+
+  @Override
+  public void setup() throws Exception {
+
+    // set the manifest committer to a localfs path for reports across
+    // all threads.
+    // do this before superclass setup so reportDir is non-null there
+    // and can be used in creating the configuration.
+    reportDir = new File(getProjectBuildDir(), "reports");
+    reportDir.mkdirs();
+
+    // superclass setup includes creating a filesystem instance
+    // for the target store.
+    super.setup();
+
+    manifestDir = path("manifests");
+
+    // destination directory defaults to method path in
+    // target FS
+    setDestDir(methodPath());
+
+    // stage statistics
+    setStageStatistics(createIOStatisticsStore().build());
+    // thread pool for task submission.
+    setSubmitter(createCloseableTaskSubmitter(POOL_SIZE, TASK_IDS.getJobId()));
+    // store operations for the target filesystem.
+    storeOperations = createManifestStoreOperations();
+  }
+
+  /**
+   * Override point: create the store operations.
+   * @return store operations for this suite.
+   */
+  protected ManifestStoreOperations createManifestStoreOperations() throws IOException {
+    final FileSystem fs = getFileSystem();
+    return ManifestCommitterSupport.createManifestStoreOperations(fs.getConf(), fs, getTestPath());
+  }
+
+  @Override
+  public void teardown() throws Exception {
+    Thread.currentThread().setName("teardown");
+
+    IOUtils.cleanupWithLogger(LOG, storeOperations, getSubmitter());
+    storeOperations = null;
+    super.teardown();
+    FILESYSTEM_IOSTATS.aggregate(retrieveIOStatistics(getFileSystem()));
+    FILESYSTEM_IOSTATS.aggregate(getStageStatistics());
+  }
+
+  /**
+   * Add a long delay so that you don't get timeouts when working
+   * with object stores or debugging.
+   * @return a longer timeout than the base classes.
+   */
+  @Override
+  protected int getTestTimeoutMillis() {
+    return 600_000;
+  }
+
+  protected Path getTestPath() {
+    return getContract().getTestPath();
+  }
+
+  /**
+   * Get the task submitter.
+   * @return submitter or null
+   */
+  protected CloseableTaskPoolSubmitter getSubmitter() {
+    return submitter;
+  }
+
+  /**
+   * Set the task submitter.
+   * @param submitter new value.
+   */
+  protected void setSubmitter(CloseableTaskPoolSubmitter submitter) {
+    this.submitter = submitter;
+  }
+
+  /**
+   * Get the executor which the submitter also uses.
+   * @return an executor.
+   */
+  protected ExecutorService getExecutorService() {
+    return getSubmitter().getPool();
+  }
+  /**
+   * @return IOStatistics for stage.
+   */
+  protected final IOStatisticsStore getStageStatistics() {
+    return stageStatistics;
+  }
+
+  /**
+   * Set the statistics.
+   * @param stageStatistics statistics.
+   */
+  protected final void setStageStatistics(IOStatisticsStore stageStatistics) {
+    this.stageStatistics = stageStatistics;
+  }
+
+  /**
+   * Get the progress counter invoked during commit operations.
+   * @return progress counter.
+   */
+  protected final ProgressCounter getProgressCounter() {
+    return progressCounter;
+  }
+
+  /**
+   * Get the report directory.
+   * @return report directory.
+   */
+  public final File getReportDir() {
+    return reportDir;
+  }
+
+  /**
+   * Get the report directory as a URI.
+   * @return report directory.
+   */
+  public final URI getReportDirUri() {
+    return getReportDir().toURI();
+  }
+
+  /**
+   * Get the (shared) thread leak tracker.
+   * @return the thread leak tracker.
+   */
+  protected static ThreadLeakTracker getThreadLeakTracker() {
+    return THREAD_LEAK_TRACKER;
+  }
+
+  /**
+   * Make sure there's no thread leakage.
+   */
+  @AfterClass
+  public static void threadLeakage() {
+    THREAD_LEAK_TRACKER.assertNoThreadLeakage();
+  }
+
+  /**
+   * Dump the filesystem statistics after the class.
+   */
+  @AfterClass
+  public static void dumpFileSystemIOStatistics() {
+    LOG.info("Aggregate FileSystem Statistics {}",
+        ioStatisticsToPrettyString(FILESYSTEM_IOSTATS));
+  }
+
+  /**
+   * Create a directory tree through an executor.
+   * dirs created = width^depth;
+   * file count = width^depth * files
+   * If createDirs == true, then directories are created at the bottom,
+   * not files.
+   * @param base base dir
+   * @param prefix prefix for filenames.
+   * @param executor submitter.
+   * @param depth depth of dirs
+   * @param width width of dirs
+   * @param files files to add in each base dir.
+   * @param createDirs create directories rather than files?
+   * @return the list of paths
+   * @throws IOException failure.
+   */
+  public final List<Path> createFilesOrDirs(Path base,
+      String prefix,
+      ExecutorService executor,
+      int depth,
+      int width,
+      int files,
+      boolean createDirs) throws IOException {
+
+    try (DurationInfo ignored = new DurationInfo(LOG, true,
+        "Creating Files %s (%d, %d, %d) under %s",
+        prefix, depth, width, files, base)) {
+
+      assertPathExists("Task attempt dir", base);
+
+      // create the files in the thread pool.
+      List<Future<Path>> futures = createFilesOrDirs(
+          new ArrayList<>(),
+          base, prefix,
+          executor,
+          depth, width, files,
+          createDirs);
+      List<Path> result = new ArrayList<>();
+
+      // now wait for the creations to finish.
+      for (Future<Path> f : futures) {
+        result.add(awaitFuture(f));
+      }
+      return result;
+    }
+  }
+
+  /**
+   * Counter incremented for each file created.
+   */
+  private final AtomicLong fileDataGenerator = new AtomicLong();
+
+  /**
+   * Create files or directories; done in a treewalk and building up
+   * a list of futures to wait for. The list is
+   * built up incrementally rather than through some merging of
+   * lists created down the tree.
+   * If createDirs == true, then directories are created at the bottom,
+   * not files.
+   *
+   * @param futures list of futures to build up.
+   * @param base base dir
+   * @param prefix prefix for filenames.
+   * @param executor submitter.
+   * @param depth depth of dirs
+   * @param width width of dirs
+   * @param files files to add in each base dir.
+   * @param createDirs create directories rather than files?
+   * @return the list of futures
+   */
+  private List<Future<Path>> createFilesOrDirs(
+      List<Future<Path>> futures,
+      Path base,
+      String prefix,
+      ExecutorService executor,
+      int depth,
+      int width,
+      int files,
+      boolean createDirs) {
+
+    if (depth > 0) {
+      // still creating directories
+      for (int i = 0; i < width; i++) {
+        Path child = new Path(base,
+            String.format("dir-%02d-%02d", depth, i));
+        // propagate createDirs so that, when requested, directories (not files) are created at the leaves
+        createFilesOrDirs(futures, child, prefix, executor, depth - 1, width, files, createDirs);
+      }
+    } else {
+      // time to create files
+      for (int i = 0; i < files; i++) {
+        Path file = new Path(base,
+            String.format("%s-%04d", prefix,
+                CREATE_FILE_COUNTER.incrementAndGet()));
+        // build the data. Not actually used in mkdir.
+        long entry = fileDataGenerator.incrementAndGet() & 0xffff;
+        byte[] data = new byte[2];
+        data[0] = (byte) (entry & 0xff);
+        data[1] = (byte) ((entry & 0xff00) >> 8);
+        // the async operation.
+        Future<Path> f = executor.submit(() -> {
+          if (!createDirs) {
+            // create files
+            ContractTestUtils.createFile(getFileSystem(), file, true, data);
+          } else {
+            // create directories
+            mkdirs(file);
+          }
+          return file;
+        });
+        futures.add(f);
+      }
+    }
+    return futures;
+  }
+
+  /**
+   * Create a list of paths under a dir.
+   * @param base base dir
+   * @param count count
+   * @return the list
+   */
+  protected List<Path> subpaths(Path base, int count) {
+    return IntStream.rangeClosed(1, count)
+        .mapToObj(i -> new Path(base, String.format("entry-%02d", i)))
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * Submit a mkdir call to the executor pool.
+   * @param path path of dir to create.
+   * @return future
+   */
+  protected CompletableFuture<Path> asyncMkdir(final Path path) {
+    CompletableFuture<Path> f = new CompletableFuture<>();
+    getExecutorService().submit(() -> {
+      try {
+        mkdirs(path);
+        f.complete(path);
+      } catch (IOException e) {
+        f.completeExceptionally(e);
+      }
+    });
+    return f;
+  }
+
+  /**
+   * Given a list of paths, create the dirs async.
+   * @param paths path list
+   * @throws IOException failure
+   */
+  protected void asyncMkdirs(Collection<Path> paths) throws IOException {
+    List<CompletableFuture<Path>> futures = new ArrayList<>();
+    // initiate
+    for (Path path: paths) {
+      futures.add(asyncMkdir(path));
+    }
+    // await
+    for (Future<Path> f : futures) {
+      awaitFuture(f);
+    }
+  }
+
+  /**
+   * Submit an operation to create a file to the executor pool.
+   * @param path path of file to create.
+   * @return future
+   */
+  protected CompletableFuture<Path> asyncPut(final Path path, byte[] data) {
+    CompletableFuture<Path> f = new CompletableFuture<>();
+    getExecutorService().submit(() -> {
+      try {
+        ContractTestUtils.createFile(getFileSystem(), path, true, data);
+        f.complete(path);
+      } catch (IOException e) {
+        f.completeExceptionally(e);
+      }
+    });
+    return f;
+  }
+
+  /**
+   * Convert the manifest list to a map by task attempt ID.
+   * @param list manifests
+   * @return a map, indexed by task attempt ID.
+   */
+  protected Map<String, TaskManifest> toMap(List<TaskManifest> list) {
+    return list.stream()
+        .collect(Collectors.toMap(TaskManifest::getTaskAttemptID, x -> x));
+  }
+
+  /**
+   * Verify the manifest files match the list of paths.
+   * @param manifest manifest to audit
+   * @param files list of files.
+   */
+  protected void verifyManifestFilesMatch(final TaskManifest manifest,
+      final List<Path> files) {
+    // get the list of source paths
+    Set<Path> filesToRename = manifest.getFilesToCommit()
+        .stream()
+        .map(FileEntry::getSourcePath)
+        .collect(Collectors.toSet());
+    // which must match that of all the files created
+    Assertions.assertThat(filesToRename)
+        .containsExactlyInAnyOrderElementsOf(files);
+  }
+
+  /**
+   * Verify that a task manifest has a given attempt ID.
+   * @param manifest manifest, may be null.
+   * @param attemptId expected attempt ID
+   * @return the manifest, guaranteed to be non-null and of the given task attempt.
+   */
+  protected TaskManifest verifyManifestTaskAttemptID(
+      final TaskManifest manifest,
+      final String attemptId) {
+    Assertions.assertThat(manifest)
+        .describedAs("Manifest of task %s", attemptId)
+        .isNotNull();
+    Assertions.assertThat(manifest.getTaskAttemptID())
+        .describedAs("Task Attempt ID of manifest %s", manifest)
+        .isEqualTo(attemptId);
+    return manifest;
+  }
+
+  /**
+   * Assert that a path must exist; return the path.
+   * @param message text for error message.
+   * @param path path to validate.
+   * @return the path
+   * @throws IOException IO Failure
+   */
+  Path pathMustExist(final String message,
+      final Path path) throws IOException {
+    assertPathExists(message, path);
+    return path;
+  }
+
+  /**
+   * Assert that a path must exist; return the path.
+   * It must also equal the expected value.
+   * @param message text for error message.
+   * @param expectedPath expected path.
+   * @param actualPath path to validate.
+   * @return the path
+   * @throws IOException IO Failure
+   */
+  Path verifyPath(final String message,
+      final Path expectedPath,
+      final Path actualPath) throws IOException {
+    Assertions.assertThat(actualPath)
+        .describedAs(message)
+        .isEqualTo(expectedPath);
+    return pathMustExist(message, actualPath);
+  }
+
+  /**
+   * Verify that the specified dir has the {@code _SUCCESS} marker
+   * and that it can be loaded.
+   * The contents will be logged and returned.
+   * @param dir directory to scan
+   * @param jobId job ID, only verified if non-empty
+   * @return the loaded success data
+   * @throws IOException IO Failure
+   */
+  protected ManifestSuccessData verifySuccessMarker(Path dir, String jobId)
+      throws IOException {
+    return validateSuccessFile(getFileSystem(), dir, 0, jobId);
+  }
+
+  /**
+   * Read a UTF-8 file.
+   * @param path path to read
+   * @return string value
+   * @throws IOException IO failure
+   */
+  protected String readFile(Path path) throws IOException {
+    return ContractTestUtils.readUTF8(getFileSystem(), path, -1);
+  }
+
+  /**
+   * Modify a (job) config to switch to the manifest committer;
+   * output validation is also enabled.
+   * @param conf config to patch.
+   * @return the updated configuration.
+   */
+  protected Configuration enableManifestCommitter(final Configuration conf) {
+    conf.set(COMMITTER_FACTORY_CLASS, MANIFEST_COMMITTER_FACTORY);
+    // always create a job marker
+    conf.setBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true);
+    // and validate the output, for extra rigor
+    conf.setBoolean(OPT_VALIDATE_OUTPUT, true);
+
+    // set the manifest rename dir if non-null
+    if (getManifestDir() != null) {
+      conf.set(OPT_DIAGNOSTICS_MANIFEST_DIR,
+          getManifestDir().toUri().toString());
+    }
+
+    // and bind the report dir
+    conf.set(OPT_SUMMARY_REPORT_DIR, getReportDirUri().toString());
+    return conf;
+  }
+
+  /**
+   * Create the stage config for a job but don't finalize it.
+   * Uses {@link #TASK_IDS} for job/task ID.
+   * @param jobAttemptNumber job attempt number
+   * @param outputPath path where the final output goes
+   * @return the config
+   */
+  protected StageConfig createStageConfigForJob(
+      final int jobAttemptNumber,
+      final Path outputPath) {
+    return createStageConfig(jobAttemptNumber, -1, 0, outputPath);
+  }
+
+  /**
+   * Create the stage config for job or task but don't finalize it.
+   * Uses {@link #TASK_IDS} for job/task ID.
+   * @param jobAttemptNumber job attempt number
+   * @param taskIndex task attempt index; -1 for job attempt only.
+   * @param taskAttemptNumber task attempt number
+   * @param outputPath path where the final output goes
+   * @return the config
+   */
+  protected StageConfig createStageConfig(
+      final int jobAttemptNumber,
+      final int taskIndex,
+      final int taskAttemptNumber,
+      final Path outputPath) {
+    final String jobId = TASK_IDS.getJobId();
+    ManifestCommitterSupport.AttemptDirectories attemptDirs =
+        new ManifestCommitterSupport.AttemptDirectories(outputPath,
+            jobId, jobAttemptNumber);
+    StageConfig config = new StageConfig();
+    config
+        .withIOProcessors(getSubmitter())
+        .withIOStatistics(getStageStatistics())
+        .withJobId(jobId)
+        .withJobIdSource(JOB_ID_SOURCE_MAPREDUCE)
+        .withJobAttemptNumber(jobAttemptNumber)
+        .withJobDirectories(attemptDirs)
+        .withName(String.format(NAME_FORMAT_JOB_ATTEMPT, jobId))
+        .withOperations(getStoreOperations())
+        .withProgressable(getProgressCounter());
+
+    // if there's a task attempt ID set, set up its details
+    if (taskIndex >= 0) {
+      String taskAttempt = TASK_IDS.getTaskAttempt(taskIndex,
+          taskAttemptNumber);
+      config
+          .withTaskAttemptId(taskAttempt)
+          .withTaskId(TASK_IDS.getTaskIdType(taskIndex).toString())
+          .withTaskAttemptDir(
+              attemptDirs.getTaskAttemptPath(taskAttempt));
+    }
+    return config;
+  }
+
+  /**
+   * A job stage config.
+   * @return stage config or null.
+   */
+  protected StageConfig getJobStageConfig() {
+    return jobStageConfig;
+  }
+
+  protected void setJobStageConfig(StageConfig jobStageConfig) {
+    this.jobStageConfig = jobStageConfig;
+  }
+
+  protected Path getDestDir() {
+    return destDir;
+  }
+
+  protected void setDestDir(Path destDir) {
+    this.destDir = destDir;
+  }
+
+  /**
+   * Execute a set of tasks; task ID is a simple count.
+   * The task attempt number is the lowest 2 bits of the task ID.
+   * @param taskAttemptCount number of task attempts.
+   * @param filesPerTaskAttempt number of files to include in manifest.
+   * @return the manifests.
+   * @throws IOException IO failure.
+   */
+  protected List<TaskManifest> executeTaskAttempts(int taskAttemptCount,
+      int filesPerTaskAttempt) throws IOException {
+
+    try (DurationInfo di = new DurationInfo(LOG, true, "create manifests")) {
+
+      // build a list of the task IDs.
+      // it's surprisingly awkward to create a List<Integer>; the java8
+      // IntStream classes don't quite fit, as they do their best to
+      // keep things unboxed, and trying to map(Integer::valueOf) doesn't help.
+      List<Integer> taskIdList = new ArrayList<>(taskAttemptCount);
+      for (int t = 0; t < taskAttemptCount; t++) {
+        taskIdList.add(t);
+      }
+
+      // execute the tasks
+      List<TaskManifest> manifests = Collections.synchronizedList(
+          new ArrayList<>());
+
+      // then submit their creation/save to the pool.
+      TaskPool.foreach(taskIdList)
+          .executeWith(getSubmitter())
+          .stopOnFailure()
+          .run(i -> {
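+            // task attempt number: the lowest two bits of the task index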
+            manifests.add(
+                executeOneTaskAttempt(i, i & 0x03, filesPerTaskAttempt));
+          });
+      return manifests;
+
+    }
+  }
+
+  /**
+   * Create a task ID and attempt (adding to taskIds and taskAttemptIds),
+   * set up the task, create a manifest with fake task entries
+   * and save that manifest to the job attempt dir.
+   * No actual files are created.
+   * @param task task index
+   * @param taskAttempt task attempt value
+   * @param filesPerTaskAttempt number of files to include in manifest.
+   * @return the manifest
+   * @throws IOException failure
+   */
+  protected TaskManifest executeOneTaskAttempt(final int task,
+      int taskAttempt, final int filesPerTaskAttempt) throws IOException {
+
+    String tid = String.format("task_%03d", task);
+    String taskAttemptId = String.format("%s_%02d",
+        tid, taskAttempt);
+    synchronized (this) {
+      taskIds.add(tid);
+      taskAttemptIds.add(taskAttemptId);
+    }
+    // for each task, a job config is created then patched with the task info
+    StageConfig taskStageConfig = createTaskStageConfig(JOB1, tid, taskAttemptId);
+
+    LOG.info("Generating manifest for {}", taskAttemptId);
+
+    // task setup: create dest dir.
+    // This helps generate a realistic
+    // workload for the parallelized job cleanup.
+    new SetupTaskStage(taskStageConfig).apply("task " + taskAttemptId);
+
+    final TaskManifest manifest = createTaskManifest(taskStageConfig);
+
+    Path taDir = taskStageConfig.getTaskAttemptDir();
+    long size = task * 1000_0000L;
+
+    // for each task attempt: filesPerTaskAttempt dirs, one file per dir.
+    for (int i = 0; i < filesPerTaskAttempt; i++) {
+      Path in = new Path(taDir, "dir-" + i);
+      Path out = new Path(getDestDir(), "dir-" + i);
+      manifest.addDirectory(DirEntry.dirEntry(out, 0, 1));
+      String name = taskStageConfig.getTaskAttemptId() + ".csv";
+      Path src = new Path(in, name);
+      Path dest = new Path(out, name);
+      long fileSize = size + i * 1000L;
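+      // the entry's etag is the hex string of its (fake) file size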
+      manifest.addFileToCommit(
+          new FileEntry(src, dest, fileSize, Long.toString(fileSize, 16)));
+      totalDataSize.addAndGet(fileSize);
+    }
+
+    // save the manifest for this stage.
+    new SaveTaskManifestStage(taskStageConfig).apply(manifest);
+    return manifest;
+  }
+
+  public StageConfig createTaskStageConfig(final int jobId, final String tid,
+      final String taskAttemptId) {
+    Path jobAttemptTaskSubDir = getJobStageConfig().getJobAttemptTaskSubDir();
+    StageConfig taskStageConfig = createStageConfigForJob(jobId, getDestDir())
+        .withTaskId(tid)
+        .withTaskAttemptId(taskAttemptId)
+        .withTaskAttemptDir(new Path(jobAttemptTaskSubDir, taskAttemptId));
+    return taskStageConfig;
+  }
+
+  /**
+   * Verify that the job directories have been cleaned up.
+   * @throws IOException IO failure
+   */
+  protected void verifyJobDirsCleanedUp() throws IOException {
+    StageConfig stageConfig = getJobStageConfig();
+    assertPathDoesNotExist("Job attempt dir", stageConfig.getJobAttemptDir());
+    assertPathDoesNotExist("dest temp dir", stageConfig.getOutputTempSubDir());
+  }
+
+  /**
+   * List a directory/directory tree and print files.
+   * @param fileSystem FS
+   * @param path path
+   * @param recursive do a recursive listing?
+   * @return the number of files found.
+   * @throws IOException failure.
+   */
+  public static long lsR(FileSystem fileSystem, Path path, boolean recursive)
+      throws Exception {
+    if (path == null) {
+      // surfaces when someone calls getParent() on something at the top
+      // of the path
+      LOG.info("Empty path");
+      return 0;
+    } else {
+      LOG.info("Listing of {}", path);
+      final long count = RemoteIterators.foreach(
+          fileSystem.listFiles(path, recursive),
+          (status) -> LOG.info("{}", status));
+      LOG.info("Count of entries: {}", count);
+      return count;
+    }
+  }
+
+  /**
+   * Assert that a cleanup stage had a given outcome and
+   * deleted the given number of directories.
+   * @param result result to analyze
+   * @param outcome expected outcome
+   * @param expectedDirsDeleted #of directories deleted. -1 for no checks
+   */
+  protected void assertCleanupResult(
+      CleanupJobStage.Result result,
+      CleanupJobStage.Outcome outcome,
+      int expectedDirsDeleted) {
+    Assertions.assertThat(result.getOutcome())
+        .describedAs("Outcome of cleanup() in %s", result)
+        .isEqualTo(outcome);
+    if (expectedDirsDeleted >= 0) {
+      Assertions.assertThat(result.getDeleteCalls())
+          .describedAs("Number of directories deleted in cleanup %s", result)
+          .isEqualTo(expectedDirsDeleted);
+    }
+  }
+
+  /**
+   * Create and execute a cleanup stage.
+   * @param enabled is the stage enabled?
+   * @param deleteTaskAttemptDirsInParallel delete task attempt dirs in
+   *        parallel?
+   * @param suppressExceptions suppress exceptions?
+   * @param outcome expected outcome.
+   * @param expectedDirsDeleted #of directories deleted. -1 for no checks
+   * @return the result
+   * @throws IOException non-suppressed exception
+   */
+  protected CleanupJobStage.Result cleanup(
+      final boolean enabled,
+      final boolean deleteTaskAttemptDirsInParallel,
+      final boolean suppressExceptions,
+      final CleanupJobStage.Outcome outcome,
+      final int expectedDirsDeleted) throws IOException {
+    StageConfig stageConfig = getJobStageConfig();
+    CleanupJobStage.Result result = new CleanupJobStage(stageConfig)
+        .apply(new CleanupJobStage.Arguments(OP_STAGE_JOB_CLEANUP,
+            enabled, deleteTaskAttemptDirsInParallel, suppressExceptions));
+    assertCleanupResult(result, outcome, expectedDirsDeleted);
+    return result;
+  }
+
+  /**
+   * Read the UTF_8 text in the file.
+   * @param path path to read
+   * @return the string
+   * @throws IOException failure
+   */
+  protected String readText(final Path path) throws IOException {
+
+    final FileSystem fs = getFileSystem();
+    final FileStatus st = fs.getFileStatus(path);
+    Assertions.assertThat(st.getLen())
+        .describedAs("length of file %s", st)
+        .isLessThanOrEqualTo(MAX_LEN);
+
+    return new String(
+        readDataset(fs, path, (int) st.getLen()),
+        StandardCharsets.UTF_8);
+  }
+
+  /**
+   * Counter.
+   */
+  protected static final class ProgressCounter implements Progressable {
+
+    private final AtomicLong counter = new AtomicLong();
+
+    /**
+     * Increment the counter.
+     */
+    @Override
+    public void progress() {
+      counter.incrementAndGet();
+    }
+
+    /**
+     * Get the counter value.
+     * @return the current value.
+     */
+    public long value() {
+      return counter.get();
+    }
+
+    /**
+     * Reset the counter.
+     */
+    public void reset() {
+      counter.set(0);
+    }
+
+    @Override
+    public String toString() {
+      final StringBuilder sb = new StringBuilder(
+          "ProgressCounter{");
+      sb.append("counter=").append(counter.get());
+      sb.append('}');
+      return sb.toString();
+    }
+  }
+
+  /**
+   * Get the progress counter of a stage.
+   * @param stageConfig stage
+   * @return its progress counter.
+   */
+  ProgressCounter progressOf(StageConfig stageConfig) {
+    return (ProgressCounter) stageConfig.getProgressable();
+  }
+}

+ 152 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/CommitterTestBinding.java

@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSource;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskID;
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+import org.apache.hadoop.util.Progressable;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitter.TASK_COMMITTER;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.randomJobId;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createIOStatisticsStore;
+
+/**
+ * This class represents a binding to a job in the target dir, with the
+ * task attempt, job attempt and associated paths.
+ * It's self-contained so as to be usable in any test suite.
+ */
+class CommitterTestBinding implements
+    IOStatisticsSource {
+
+  /**
+   * IOStatistics counter for progress events.
+   */
+  public static final String PROGRESS_EVENTS = "progress_events";
+
+  /**
+   * IOStatistics to update with progress events.
+   */
+  private final IOStatisticsStore iostatistics;
+
+  /**
+   * Job attempt ID.
+   */
+  private final String jobAttemptId;
+
+  /**
+   * Job ID.
+   */
+  private final JobID jobId;
+
+  /**
+   * Task Attempt ID, under the job attempt.
+   */
+  private final TaskAttemptID taskAttemptId;
+
+  /**
+   * Task ID.
+   */
+  private final TaskID taskId;
+
+  /**
+   * Task attempt context for the given task Attempt.
+   */
+  private final TaskAttemptContext taskAttemptContext;
+
+  /**
+   * Construct.
+   * @param conf job/task config. This is patched with the app attempt.
+   * @param appAttempt application attempt.
+   * @param taskNumber task number
+   * @param taskAttemptNumber which attempt on this task is it
+   */
+  CommitterTestBinding(
+      Configuration conf,
+      int appAttempt, int taskNumber, int taskAttemptNumber) {
+    iostatistics = createIOStatisticsStore()
+        .withCounters(PROGRESS_EVENTS)
+        .build();
+
+
+    // this is the job ID, with no attempt info.
+    jobId = JobID.forName(randomJobId());
+    jobAttemptId = jobId.toString() + "_" + appAttempt;
+    taskId = new TaskID(jobId, TaskType.MAP, taskNumber);
+    taskAttemptId = new TaskAttemptID(taskId,
+        taskAttemptNumber);
+    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, appAttempt);
+    taskAttemptContext = new TaskAttemptContextImpl(conf, taskAttemptId);
+
+  }
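+
+  // Illustrative use: new CommitterTestBinding(conf, 1, 0, 0)
+  //   .createStageConfig(outputPath) yields a StageConfig whose
+  //   progress callback increments the PROGRESS_EVENTS counter.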
+
+  /**
+   * Create a committer config for the given output path.
+   * @param outputPath output path in destFS.
+   * @return a committer for the active task.
+   */
+  ManifestCommitterConfig createCommitterConfig(
+      Path outputPath) {
+    return new ManifestCommitterConfig(outputPath,
+        TASK_COMMITTER,
+        taskAttemptContext,
+        iostatistics,
+        null);
+  }
+
+  /**
+   * Create a stage config from the committer config.
+   * All stats go to the local IOStatisticsStore;
+   * there's a progress callback also set to increment
+   * the counter {@link #PROGRESS_EVENTS}
+   * @return a stage config
+   */
+  StageConfig createStageConfig(Path outputPath) {
+    return createCommitterConfig(outputPath)
+        .createStageConfig()
+        .withProgressable(new ProgressCallback());
+  }
+
+  @Override
+  public IOStatisticsStore getIOStatistics() {
+    return iostatistics;
+  }
+
+  /**
+   * Whenever this progress callback is invoked, the progress_events
+   * counter is incremented. This allows for tests to verify that
+   * callbacks have occurred by asserting on the event counter.
+   */
+  private final class ProgressCallback implements Progressable {
+
+    @Override
+    public void progress() {
+      iostatistics.incrementCounter(PROGRESS_EVENTS, 1);
+    }
+  }
+}

+ 420 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterTestSupport.java

@@ -0,0 +1,420 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.assertj.core.api.Assertions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskID;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestPrinter;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.util.functional.RemoteIterators;
+
+import static org.apache.commons.lang3.StringUtils.isNotEmpty;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_CLASSNAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Support for committer tests.
+ */
+public final class ManifestCommitterTestSupport {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      ManifestCommitterTestSupport.class);
+
+  private static final DateTimeFormatter FORMATTER =
+      DateTimeFormatter.ofPattern("yyyyMMddHHmmss");
+
+  /**
+   * Build directory property.
+   * Value: {@value}.
+   */
+  public static final String PROJECT_BUILD_DIRECTORY_PROPERTY
+      = "project.build.directory";
+
+  /**
+   * Default number of task attempts for some tests.
+   * Value: {@value}.
+   */
+  public static final int NUMBER_OF_TASK_ATTEMPTS = 200;
+
+  private ManifestCommitterTestSupport() {
+  }
+
+  /**
+   * Create a random Job ID using the fork ID as part of the number, if
+   * set in the current process.
+   * @return a job ID string in a format parseable as a job ID
+   */
+  public static String randomJobId() {
+    String testUniqueForkId = System.getProperty("test.unique.fork.id", "0001");
+    int l = testUniqueForkId.length();
+    String trailingDigits = testUniqueForkId.substring(l - 4, l);
+    int digitValue;
+    try {
+      digitValue = Integer.valueOf(trailingDigits);
+    } catch (NumberFormatException e) {
+      digitValue = 0;
+    }
+
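+    // e.g. (illustrative) "202401011230450042_0001":
+    // timestamp + random digits, then the fork ID digits.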
+    return String.format("%s%04d_%04d",
+        FORMATTER.format(LocalDateTime.now()),
+        (long) (Math.random() * 1000),
+        digitValue);
+  }
+
+  public static File getProjectBuildDir() {
+    String propval = System.getProperty(PROJECT_BUILD_DIRECTORY_PROPERTY);
+    if (StringUtils.isEmpty(propval)) {
+      propval = "target";
+    }
+    return new File(propval).getAbsoluteFile();
+  }
+
+  /**
+   * Load a success file; fail if the file is empty/nonexistent.
+   * @param fs filesystem
+   * @param outputPath directory containing the success file.
+   * @return the loaded file.
+   * @throws IOException failure to find/load the file
+   * @throws AssertionError if the file is 0 bytes long.
+   */
+  public static ManifestSuccessData loadSuccessFile(final FileSystem fs,
+      final Path outputPath) throws IOException {
+    Path success = new Path(outputPath, SUCCESS_MARKER);
+    return ManifestSuccessData.load(fs, success);
+  }
+
+  /**
+   * Load in the success data marker.
+   * @param fs filesystem
+   * @param outputDir output path of the job
+   * @param minimumFileCount minimum number of files to have been created
+   * @param jobId job ID, only verified if non-empty
+   * @return the success data
+   * @throws IOException IO failure
+   */
+  public static ManifestSuccessData validateSuccessFile(
+      final FileSystem fs,
+      final Path outputDir,
+      final int minimumFileCount,
+      final String jobId) throws IOException {
+    Path successPath = new Path(outputDir, SUCCESS_MARKER);
+    ManifestSuccessData successData
+        = loadAndPrintSuccessData(fs, successPath);
+    assertThat(successData.getCommitter())
+        .describedAs("Committer field")
+        .isEqualTo(MANIFEST_COMMITTER_CLASSNAME);
+    assertThat(successData.getFilenames())
+        .describedAs("Files committed")
+        .hasSizeGreaterThanOrEqualTo(minimumFileCount);
+    if (isNotEmpty(jobId)) {
+      assertThat(successData.getJobId())
+          .describedAs("JobID")
+          .isEqualTo(jobId);
+    }
+    return successData;
+  }
+
+  /**
+   * Load in and print a success data manifest.
+   * @param fs filesystem
+   * @param successPath full path to success file.
+   * @return the success data
+   * @throws IOException IO failure
+   */
+  public static ManifestSuccessData loadAndPrintSuccessData(
+      FileSystem fs,
+      Path successPath) throws IOException {
+    LOG.info("Manifest {}", successPath);
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    PrintStream ps = new PrintStream(baos);
+    final ManifestPrinter showManifest = new ManifestPrinter(fs.getConf(), ps);
+    ManifestSuccessData successData = showManifest.loadAndPrintManifest(fs, successPath);
+    LOG.info("{}", baos);
+    return successData;
+  }
+
+  /**
+   * Validate all generated files from the manifest.
+   * All files in the manifest must exist.
+   * If the exclusive flag is set, only those must exist
+   * (ignoring all temp files and everything in the _temporary
+   * dir)
+   * @param fs filesystem
+   * @param destDir dest dir to scan
+   * @param successData manifest
+   * @param exclusive expect exclusive and complete data.
+   * @return the files and their status
+   * @throws IOException IO failure.
+   */
+  public static Map<Path, LocatedFileStatus> validateGeneratedFiles(
+      FileSystem fs,
+      Path destDir,
+      ManifestSuccessData successData,
+      boolean exclusive) throws IOException {
+    Map<Path, LocatedFileStatus> map = new HashMap<>();
+    RemoteIterators.foreach(fs.listFiles(destDir, true),
+        e -> {
+          if (!e.getPath().getName().startsWith("_")) {
+            map.put(e.getPath(), e);
+          }
+        });
+    // map has all files other than temp ones and the success marker
+    // what do we expect
+    final List<Path> expected = filesInManifest(successData);
+
+    // all of those must be found
+    Assertions.assertThat(map.keySet())
+        .describedAs("Files in FS compared to manifest")
+        .containsAll(expected);
+
+    // and if exclusive, that too
+    if (exclusive) {
+      Assertions.assertThat(map.keySet())
+          .describedAs("Files in FS compared to manifest")
+          .containsExactlyInAnyOrderElementsOf(expected);
+    }
+    return map;
+  }
+
+  /**
+   * Given a manifest, get the list of filenames
+   * and convert to paths.
+   * @param successData data
+   * @return the paths.
+   */
+  public static List<Path> filesInManifest(ManifestSuccessData successData) {
+    return successData.getFilenames().stream()
+        .map(AbstractManifestData::unmarshallPath)
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * List a directory/directory tree.
+   * @param fileSystem FS
+   * @param path path
+   * @param recursive do a recursive listing?
+   * @return the number of files found.
+   * @throws IOException failure.
+   */
+  public static long lsR(FileSystem fileSystem, Path path, boolean recursive)
+      throws Exception {
+    if (path == null) {
+      // surfaces when someone calls getParent() on something at the top
+      // of the path
+      LOG.info("Empty path");
+      return 0;
+    }
+    return RemoteIterators.foreach(fileSystem.listFiles(path, recursive),
+        (status) -> LOG.info("{}", status));
+  }
+
+  /**
+   * Assert that a file or dir entry matches the given parameters.
+   * Matching on paths, not strings, helps validate marshalling.
+   * @param fileOrDir file or directory
+   * @param src source path
+   * @param dest dest path
+   * @param l length
+   */
+  static void assertFileEntryMatch(
+      final FileEntry fileOrDir,
+      final Path src,
+      final Path dest,
+      final long l) {
+    String entry = fileOrDir.toString();
+    assertThat(fileOrDir.getSourcePath())
+        .describedAs("Source path of " + entry)
+        .isEqualTo(src);
+    assertThat(fileOrDir.getDestPath())
+        .describedAs("Dest path of " + entry)
+        .isEqualTo(dest);
+    assertThat(fileOrDir.getSize())
+        .describedAs("Size of " + entry)
+        .isEqualTo(l);
+  }
+
+  /**
+   * Assert that a dir entry matches the given parameters.
+   * Matching on paths, not strings, helps validate marshalling.
+   * @param fileOrDir file or directory
+   * @param dest dest path
+   * @param type type
+   */
+  static void assertDirEntryMatch(
+      final DirEntry fileOrDir,
+      final Path dest,
+      final long type) {
+    String entry = fileOrDir.toString();
+    assertThat(fileOrDir.getDestPath())
+        .describedAs("Dest path of " + entry)
+        .isEqualTo(dest);
+    assertThat(fileOrDir.getType())
+        .describedAs("type of " + entry)
+        .isEqualTo(type);
+  }
+
+  /**
+   * Closeable which can be used to safely close writers in
+   * a try-with-resources block.
+   */
+  public static final class CloseWriter<K, V> implements AutoCloseable {
+
+    private final RecordWriter<K, V> writer;
+
+    private final TaskAttemptContext context;
+
+    public CloseWriter(RecordWriter<K, V> writer,
+        TaskAttemptContext context) {
+      this.writer = writer;
+      this.context = context;
+    }
+
+    @Override
+    public void close() {
+      try {
+        writer.close(context);
+      } catch (IOException | InterruptedException e) {
+        LOG.error("When closing {} on context {}",
+            writer, context, e);
+      }
+    }
+  }
+
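+  /**
+   * Task attempt ID format; expands to, e.g. (illustrative),
+   * {@code attempt_<jobId>_m_000003_1} for task 3, attempt 1.
+   */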
+  public static final String ATTEMPT_STRING =
+      "attempt_%s_m_%06d_%d";
+
+  /**
+   * Creates a random JobID and then the given number of tasks,
+   * each with the specified number of task attempts.
+   */
+  public static final class JobAndTaskIDsForTests {
+
+    /** Job ID; will be created uniquely for each instance. */
+    private final String jobId;
+
+    /**
+     * Store the details as strings; generate
+     * IDs on demand.
+     */
+    private final String[][] taskAttempts;
+
+    /**
+     * Constructor.
+     * @param tasks number of tasks.
+     * @param attempts number of attempts.
+     */
+    public JobAndTaskIDsForTests(int tasks, int attempts) {
+      this(randomJobId(), tasks, attempts);
+    }
+
+    public JobAndTaskIDsForTests(final String jobId,
+        int tasks, int attempts) {
+      this.jobId = jobId;
+      this.taskAttempts = new String[tasks][attempts];
+      for (int i = 0; i < tasks; i++) {
+        for (int j = 0; j < attempts; j++) {
+          String a = String.format(ATTEMPT_STRING,
+              jobId, i, j);
+          this.taskAttempts[i][j] = a;
+        }
+      }
+    }
+
+    /**
+     * Get the job ID.
+     * @return job ID string.
+     */
+    public String getJobId() {
+      return jobId;
+    }
+
+    /**
+     * Get the job ID as the MR type.
+     * @return job ID type.
+     */
+    public JobID getJobIdType() {
+      return getTaskIdType(0).getJobID();
+    }
+
+    /**
+     * Get a task attempt ID.
+     * @param task task index
+     * @param attempt attempt number.
+     * @return the task attempt.
+     */
+    public String getTaskAttempt(int task, int attempt) {
+      return taskAttempts[task][attempt];
+    }
+
+    /**
+     * Get task attempt ID as the MR type.
+     * @param task task index
+     * @param attempt attempt number.
+     * @return the task attempt type
+     */
+    public TaskAttemptID getTaskAttemptIdType(int task, int attempt) {
+      return TaskAttemptID.forName(getTaskAttempt(task, attempt));
+    }
+
+    /**
+     * Get task ID as the MR type.
+     * @param task task index
+     * @return the task ID type
+     */
+    public TaskID getTaskIdType(int task) {
+      return TaskAttemptID.forName(getTaskAttempt(task, 0)).getTaskID();
+    }
+
+    /**
+     * Get task ID as a string.
+     * @param task task index
+     * @return the task ID
+     */
+    public String getTaskId(int task) {
+      return getTaskIdType(task).toString();
+    }
+
+  }
+}

+ 109 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/StubStoreOperations.java

@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+import java.util.NoSuchElementException;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.util.JsonSerialization;
+
+/**
+ * Stub Store operations.
+ * Everything "works" provided you don't look too close.
+ * Files have etags of their filename; if you move a file without changing its
+ * name, the etag is preserved.
+ */
+public class StubStoreOperations extends ManifestStoreOperations {
+
+  @Override
+  public FileStatus getFileStatus(final Path path) throws IOException {
+    return new TaggedFileStatus(0, false, 1, 1024, 0, path, path.getName());
+  }
+
+  @Override
+  public boolean delete(final Path path, final boolean recursive)
+      throws IOException {
+    return true;
+  }
+
+  @Override
+  public boolean mkdirs(final Path path) throws IOException {
+    return true;
+  }
+
+  @Override
+  public boolean renameFile(final Path source, final Path dest)
+      throws IOException {
+    return true;
+  }
+
+  @Override
+  public RemoteIterator<FileStatus> listStatusIterator(final Path path)
+      throws IOException {
+    return new EmptyRemoteIterator<>();
+  }
+
+  @Override
+  public TaskManifest loadTaskManifest(JsonSerialization<TaskManifest> serializer,
+      final FileStatus st) throws IOException {
+    return new TaskManifest();
+  }
+
+  @Override
+  public <T extends AbstractManifestData<T>> void save(T manifestData,
+      final Path path,
+      final boolean overwrite) throws IOException {
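+    // no-op: the stub does not persist manifest data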
+
+  }
+
+  @Override
+  public void close() throws IOException {
+
+  }
+
+  @Override
+  public boolean storePreservesEtagsThroughRenames(final Path path) {
+    return true;
+  }
+
+  /**
+   * Always-empty remote iterator.
+   * @param <T> type of iterator.
+   */
+  private static final class EmptyRemoteIterator<T>
+      implements RemoteIterator<T> {
+
+    @Override
+    public boolean hasNext() throws IOException {
+      return false;
+    }
+
+    @Override
+    public T next() throws IOException {
+      throw new NoSuchElementException();
+    }
+  }
+
+}

+ 54 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TaggedFileStatus.java

@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.EtagSource;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * File Status with etag derived from the filename, if not explicitly set.
+ */
+public final class TaggedFileStatus extends FileStatus implements EtagSource {
+
+  private final String etag;
+
+  public TaggedFileStatus(final long length,
+      final boolean isdir,
+      final int blockReplication,
+      final long blocksize,
+      final long modificationTime,
+      final Path path,
+      final String etag) {
+    super(length, isdir, blockReplication, blocksize, modificationTime, path);
+    this.etag = etag;
+  }
+
+  public TaggedFileStatus(final FileStatus other, final String etag) throws IOException {
+    super(other);
+    this.etag = etag;
+  }
+
+  @Override
+  public String getEtag() {
+    return etag;
+  }
+}

+ 142 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCleanupStage.java

@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.util.List;
+
+import org.junit.Test;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.UnreliableManifestStoreOperations;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+
+
+/**
+ * Test the cleanup stage.
+ */
+public class TestCleanupStage extends AbstractManifestCommitterTest {
+
+  /**
+   * Number of task attempts to create. Manifests are created and written
+   * as well as test dirs, but no actual files.
+   */
+  protected static final int TASK_ATTEMPT_COUNT = 10;
+
+  /**
+   * Number of delete calls expected for the root job delete.
+   */
+  protected static final int ROOT_DELETE_COUNT = 1;
+
+  /**
+   * Total invocation count for a successful parallel delete job.
+   */
+  protected static final int PARALLEL_DELETE_COUNT =
+      TASK_ATTEMPT_COUNT + ROOT_DELETE_COUNT;
+
+  /**
+   * Fault Injection.
+   */
+  private UnreliableManifestStoreOperations failures;
+
+  /**
+   * Manifests created.
+   */
+  private List<TaskManifest> manifests;
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    failures = new UnreliableManifestStoreOperations(
+        createManifestStoreOperations());
+    setStoreOperations(failures);
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir);
+    setJobStageConfig(stageConfig);
+    new SetupJobStage(stageConfig).apply(true);
+
+    // lots of tasks, but don't bother creating mock files.
+    manifests = executeTaskAttempts(TASK_ATTEMPT_COUNT, 0);
+  }
+
+  @Test
+  public void testCleanupInParallelHealthy() throws Throwable {
+    describe("parallel cleanup of TA dirs.");
+    cleanup(true, true, false,
+        CleanupJobStage.Outcome.PARALLEL_DELETE,
+        PARALLEL_DELETE_COUNT);
+    verifyJobDirsCleanedUp();
+  }
+
+  @Test
+  public void testCleanupSingletonHealthy() throws Throwable {
+    describe("Cleanup with a single delete. Not the default; would be best on HDFS");
+
+    cleanup(true, false, false,
+        CleanupJobStage.Outcome.DELETED, ROOT_DELETE_COUNT);
+    verifyJobDirsCleanedUp();
+  }
+
+  @Test
+  public void testCleanupNoDir() throws Throwable {
+    describe("parallel cleanup MUST not fail if there's no dir");
+    // first do the cleanup
+    cleanup(true, true, false,
+        CleanupJobStage.Outcome.PARALLEL_DELETE, PARALLEL_DELETE_COUNT);
+
+    // now expect cleanup by a single delete to still work;
+    // the delete count is 0 as the pre-check skips it
+    cleanup(true, false, false,
+        CleanupJobStage.Outcome.NOTHING_TO_CLEAN_UP, 0);
+
+    // if skipped, that happens first
+    cleanup(false, true, false,
+        CleanupJobStage.Outcome.DISABLED, 0);
+  }
+
+  @Test
+  public void testFailureInParallelDelete() throws Throwable {
+    describe("A parallel delete fails, but the base delete works");
+
+    // pick one of the manifests
+    TaskManifest manifest = manifests.get(4);
+    Path taPath = new Path(manifest.getTaskAttemptDir());
+    failures.addDeletePathToFail(taPath);
+    cleanup(true, true, false,
+        CleanupJobStage.Outcome.DELETED, PARALLEL_DELETE_COUNT);
+  }
+
+  /**
+   * If there's no job task attempt subdir then listing it will raise
+   * an FNFE; this MUST be caught and the base delete executed.
+   */
+  @Test
+  public void testParallelDeleteNoTaskAttemptDir() throws Throwable {
+    describe("Execute parallel delete where" +
+        " the job task directory does not exist");
+    StageConfig stageConfig = getJobStageConfig();
+    // TA dir doesn't exist, so listing will fail.
+    failures.addPathNotFound(stageConfig.getJobAttemptTaskSubDir());
+    cleanup(true, true, false,
+        CleanupJobStage.Outcome.DELETED, ROOT_DELETE_COUNT);
+  }
+
+}

+ 126 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCommitTaskStage.java

@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.FileNotFoundException;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * Test committing a task.
+ */
+public class TestCommitTaskStage extends AbstractManifestCommitterTest {
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir);
+    setJobStageConfig(stageConfig);
+    new SetupJobStage(stageConfig).apply(true);
+  }
+
+  @Test
+  public void testCommitMissingDirectory() throws Throwable {
+
+    String tid = String.format("task_%03d", 1);
+    String taskAttemptId = String.format("%s_%02d",
+        tid, 1);
+    StageConfig taskStageConfig = createTaskStageConfig(JOB1, tid,
+        taskAttemptId);
+
+    // the task attempt dir does not exist
+    Path taDir = taskStageConfig.getTaskAttemptDir();
+    assertPathDoesNotExist("task attempt path", taDir);
+
+    // so the task commit fails
+    intercept(FileNotFoundException.class, () ->
+        new CommitTaskStage(taskStageConfig).apply(null));
+  }
+
+  @Test
+  public void testCommitEmptyDirectory() throws Throwable {
+
+    describe("Commit an empty directory as task then job");
+    String tid = String.format("task_%03d", 2);
+    String taskAttemptId = String.format("%s_%02d",
+        tid, 1);
+    StageConfig taskStageConfig = createTaskStageConfig(JOB1, tid,
+        taskAttemptId);
+
+    // set up the task
+    new SetupTaskStage(taskStageConfig).apply("setup");
+
+    CommitTaskStage.Result result = new CommitTaskStage(taskStageConfig)
+        .apply(null);
+
+    final TaskManifest manifest = result.getTaskManifest();
+    Assertions.assertThat(manifest.getDestDirectories())
+        .as("directories to create")
+        .isEmpty();
+    Assertions.assertThat(manifest.getFilesToCommit())
+        .as("files to commit")
+        .isEmpty();
+
+    final Path path = result.getPath();
+
+    final String manifestBody = readText(path);
+
+    LOG.info("manifest at {} of length {}:\n{}",
+        path, manifestBody.length(), manifestBody);
+
+    // now commit
+    final CommitJobStage.Result outcome = new CommitJobStage(getJobStageConfig())
+        .apply(new CommitJobStage.Arguments(
+            true, true, null,
+            new CleanupJobStage.Arguments(
+                OP_STAGE_JOB_CLEANUP,
+                true,
+                true,
+                false
+            )));
+
+    // review success file
+    final Path successPath = outcome.getSuccessPath();
+    String successBody = readText(successPath);
+    LOG.info("successBody at {} of length {}:\n{}",
+        successPath, successBody.length(), successBody);
+
+    final ManifestSuccessData successData = outcome.getJobSuccessData();
+    Assertions.assertThat(successData.getFilenames())
+        .as("Filenames in _SUCCESS")
+        .isEmpty();
+  }
+
+}

+ 307 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCreateOutputDirectoriesStage.java

@@ -0,0 +1,307 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.stream.Collectors;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.EntryStatus;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CreateOutputDirectoriesStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.util.Lists;
+
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.lookupCounterStatistic;
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_DELETE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_GET_FILE_STATUS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_IS_FILE;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_MKDIRS;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_FAILURES;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_DELETE_FILE_UNDER_DESTINATION;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_PREPARE_DIR_ANCESTORS;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * Test directory creation.
+ * As the directory creation phase relies on input from the task manifests to
+ * determine which directories to explicitly create, which files to delete,
+ * etc., these tests build up manifests and assert that the output
+ * of the directory creation stage matches that of the combination
+ * of the manifests and the filesystem state.
+ */
+public class TestCreateOutputDirectoriesStage extends AbstractManifestCommitterTest {
+
+  /**
+   * Deep tree width; subclasses (including those in external projects)
+   * may change it.
+   */
+  protected static final int DEEP_TREE_WIDTH = 4;
+
+  /**
+   * The number of directories created in test setup; this must be
+   * added to all assertions of the value of OP_MKDIRS.
+   */
+  private static final int DIRECTORIES_CREATED_IN_SETUP = 2;
+
+  private Path destDir;
+  private CreateOutputDirectoriesStage mkdirStage;
+  private StageConfig stageConfig;
+  private IOStatisticsStore iostats;
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    destDir = methodPath();
+    // clean up dest dir completely
+    destDir.getFileSystem(getConfiguration()).delete(destDir, true);
+    setStoreOperations(createManifestStoreOperations());
+    stageConfig = createStageConfigForJob(JOB1, destDir)
+        .withDeleteTargetPaths(true);
+    setJobStageConfig(stageConfig);
+    // creates the job directories.
+    new SetupJobStage(stageConfig).apply(true);
+    mkdirStage = new CreateOutputDirectoriesStage(stageConfig);
+    iostats = stageConfig.getIOStatistics();
+    // assert original count of dirs created == 2 : job and task manifest
+    verifyStatisticCounterValue(iostats, OP_MKDIRS,
+        DIRECTORIES_CREATED_IN_SETUP);
+    // reset the value to simplify future work
+    iostats.getCounterReference(OP_MKDIRS).set(0);
+  }
+
+  @Test
+  public void testPrepareSomeDirs() throws Throwable {
+
+    final long initialFileStatusCount = lookupCounterStatistic(iostats, OP_GET_FILE_STATUS);
+    final int dirCount = 8;
+    final List<Path> dirs = subpaths(destDir, dirCount);
+    final List<DirEntry> dirEntries = dirEntries(dirs, 1, EntryStatus.not_found);
+
+    // two manifests with duplicate entries
+    final List<TaskManifest> manifests = Lists.newArrayList(
+        manifestWithDirsToCreate(dirEntries),
+        manifestWithDirsToCreate(dirEntries));
+    final CreateOutputDirectoriesStage.Result result = mkdirStage.apply(manifests);
+    Assertions.assertThat(result.getCreatedDirectories())
+        .describedAs("output of %s", mkdirStage)
+        .containsExactlyInAnyOrderElementsOf(dirs);
+
+    LOG.info("Job Statistics\n{}", ioStatisticsToPrettyString(iostats));
+
+    // now dirCount new dirs are added.
+    verifyStatisticCounterValue(iostats, OP_MKDIRS, dirCount);
+
+    // now rerun the same preparation sequence, but this
+    // time declare that the directories exist (as they do)
+    final CreateOutputDirectoriesStage s2 =
+        new CreateOutputDirectoriesStage(stageConfig);
+    final CreateOutputDirectoriesStage.Result r2 = s2.apply(
+        Lists.newArrayList(
+            manifestWithDirsToCreate(dirEntries(dirs, 1, EntryStatus.dir))));
+
+    // no directories are now created.
+    Assertions.assertThat(r2.getCreatedDirectories())
+        .describedAs("output of %s", s2)
+        .isEmpty();
+    LOG.info("Job Statistics after second pass\n{}", ioStatisticsToPrettyString(iostats));
+
+    // second run probed no dest dirs
+    verifyStatisticCounterValue(iostats, OP_GET_FILE_STATUS, initialFileStatusCount);
+    // and no new mkdir calls were made
+    verifyStatisticCounterValue(iostats, OP_MKDIRS, dirCount);
+    verifyStatisticCounterValue(iostats, OP_DELETE_FILE_UNDER_DESTINATION, 0);
+    verifyStatisticCounterValue(iostats, OP_IS_FILE, 0);
+  }
+
+  /**
+   * Given a list of paths, build a list of DirEntry entries.
+   * @param paths list of paths
+   * @param level Level in the treewalk.
+   * @param entryStatus status of dirs
+   * @return list of entries with the given level and entry status.
+   */
+  protected List<DirEntry> dirEntries(Collection<Path> paths,
+      int level,
+      EntryStatus entryStatus) {
+    return paths.stream()
+        .map(p -> DirEntry.dirEntry(p, entryStatus, level))
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * Create a manifest with the list of directory entries added.
+   * Job commit requires the entries to have been probed for, and
+   * for the entire tree under the dest path to be included.
+   * @param dirEntries list of directory entries.
+   * @return the manifest.
+   */
+  protected TaskManifest manifestWithDirsToCreate(List<DirEntry> dirEntries) {
+    final TaskManifest taskManifest = new TaskManifest();
+    taskManifest.getDestDirectories().addAll(dirEntries);
+    return taskManifest;
+  }
+
+  /**
+   * Assert the directory map status of a path.
+   * @param result stage result
+   * @param path path to look up
+   * @param expected expected value.
+   */
+  private static void assertDirMapStatus(
+      CreateOutputDirectoriesStage.Result result,
+      Path path,
+      CreateOutputDirectoriesStage.DirMapState expected) {
+    Assertions.assertThat(result.getDirMap())
+        .describedAs("Directory Map entry for %s", path)
+        .isNotNull()
+        .containsKey(path)
+        .containsEntry(path, expected);
+  }
+
+  /**
+   * Prepare a deep tree {@code c ^ 3} of entries; with the default
+   * width of 4 this is 4 + 16 + 64 = 84 directory entries across the
+   * three levels.
+   * Make one of the parent entries a file, another an existing dir,
+   * and one of the leaf entries a file.
+   *
+   * From a test-purity perspective, these should
+   * be separate tests. But performing the
+   * operations in the same test case spreads the
+   * directory setup costs across them, rather than
+   * duplicating it.
+   */
+  @Test
+  public void testPrepareDirtyTree() throws Throwable {
+
+    // build the lists of paths for the different levels
+    final int c = getDeepTreeWidth();
+    final List<Path> level1 = subpaths(destDir, c);
+    final List<Path> level2 = level1.stream().flatMap(p ->
+            subpaths(p, c).stream())
+        .collect(Collectors.toList());
+    final List<Path> level3 = level2.stream().flatMap(p ->
+            subpaths(p, c).stream())
+        .collect(Collectors.toList());
+    // manifest dir entry list contains all levels > 0
+    // adding them out of order verifies sorting takes place
+    // before the merge routine which is intended to strip
+    // out parent dirs
+    final List<DirEntry> directories = new ArrayList<>();
+    final List<DirEntry> l1 = dirEntries(level1, 1, EntryStatus.not_found);
+    directories.addAll(l1);
+    final List<DirEntry> l3 = dirEntries(level3, 3, EntryStatus.not_found);
+    directories.addAll(l3);
+    final List<DirEntry> l2 = dirEntries(level2, 2, EntryStatus.not_found);
+    directories.addAll(l2);
+
+    // one of the level 1 paths is going to be a file
+    final DirEntry parentIsFile = l1.get(1);
+    // one entry has a dir already
+    final DirEntry parentIsDir = l2.get(0);
+    // and one of the dest dirs is a file.
+    final DirEntry leafIsFile = l3.get(0);
+
+    // prepare the output
+    CompletableFuture.allOf(
+        asyncPut(parentIsFile.getDestPath(), NO_DATA),
+        asyncPut(leafIsFile.getDestPath(), NO_DATA),
+        asyncMkdir(parentIsDir.getDestPath()))
+        .join();
+
+    // patch the entries; as they are references into the lists,
+    // this updates the values there.
+    parentIsFile.setStatus(EntryStatus.file);
+    parentIsDir.setStatus(EntryStatus.dir);
+    leafIsFile.setStatus(EntryStatus.file);
+
+    final List<TaskManifest> manifests = Lists.newArrayList(
+        manifestWithDirsToCreate(directories));
+
+    // first attempt will succeed.
+    final CreateOutputDirectoriesStage.Result result =
+        mkdirStage.apply(manifests);
+
+    LOG.info("Job Statistics\n{}", ioStatisticsToPrettyString(iostats));
+
+    assertDirMapStatus(result, leafIsFile.getDestPath(),
+        CreateOutputDirectoriesStage.DirMapState.fileNowDeleted);
+
+    // for the parent dir, all is good
+    assertDirMapStatus(result, parentIsFile.getDestPath(),
+        CreateOutputDirectoriesStage.DirMapState.fileNowDeleted);
+    Assertions.assertThat(result.getCreatedDirectories())
+        .describedAs("output of %s", mkdirStage)
+        .containsExactlyInAnyOrderElementsOf(level3);
+    verifyStatisticCounterValue(iostats, OP_MKDIRS, level3.size());
+    // do a rerun, where the directory setup will fail because
+    // a directory is present where the manifest says there is
+    // a file.
+    CreateOutputDirectoriesStage attempt2 =
+        new CreateOutputDirectoriesStage(
+            createStageConfigForJob(JOB1, destDir)
+                .withDeleteTargetPaths(true));
+    // attempt will fail because one of the entries marked as
+    // a file to delete is now a non-empty directory
+    LOG.info("Executing failing attempt to create the directories");
+    intercept(IOException.class, () -> attempt2.apply(manifests));
+    verifyStatisticCounterValue(iostats, OP_PREPARE_DIR_ANCESTORS + SUFFIX_FAILURES, 1);
+    verifyStatisticCounterValue(iostats, OP_DELETE + SUFFIX_FAILURES, 1);
+
+    // build a new directory list where everything is declared a directory;
+    // no dirs will be created this time as they all exist.
+    final List<DirEntry> directories3 = new ArrayList<>();
+    directories3.addAll(dirEntries(level1, 1, EntryStatus.dir));
+    directories3.addAll(dirEntries(level2, 2, EntryStatus.dir));
+    directories3.addAll(dirEntries(level3, 3, EntryStatus.dir));
+
+    final List<TaskManifest> manifests3 = Lists.newArrayList(
+        manifestWithDirsToCreate(directories3));
+    CreateOutputDirectoriesStage attempt3 =
+        new CreateOutputDirectoriesStage(
+            createStageConfigForJob(JOB1, destDir)
+                .withDeleteTargetPaths(true));
+    final CreateOutputDirectoriesStage.Result r3 =
+        attempt3.apply(manifests3);
+    assertDirMapStatus(r3, leafIsFile.getDestPath(),
+        CreateOutputDirectoriesStage.DirMapState.dirFoundInStore);
+    Assertions.assertThat(r3.getCreatedDirectories())
+        .describedAs("created directories")
+        .isEmpty();
+  }
+
+  /**
+   * Get the width of the deep tree; subclasses may tune this for test
+   * performance, though a wide tree is more representative of real jobs.
+   * @return number of subdirs to create at each level. Must be at least 2.
+   */
+  protected int getDeepTreeWidth() {
+    return DEEP_TREE_WIDTH;
+  }
+}

+ 601 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestJobThroughManifestCommitter.java

@@ -0,0 +1,601 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.assertj.core.api.Assertions;
+import org.junit.FixMethodOrder;
+import org.junit.Test;
+import org.junit.runners.MethodSorters;
+
+import org.apache.hadoop.fs.FileAlreadyExistsException;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.OutputValidationException;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.AbortTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.LoadManifestsStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.ValidateRenamedFilesStage;
+import org.apache.hadoop.net.NetUtils;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.rm;
+import static org.apache.hadoop.fs.contract.ContractTestUtils.verifyPathExists;
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticCounter;
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.JOB_ID_SOURCE_MAPREDUCE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_CLASSNAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_BYTES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_FILES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.loadAndPrintSuccessData;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.validateGeneratedFiles;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.PRINCIPAL;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.STAGE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestPathForTask;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage.DISABLED;
+import static org.apache.hadoop.security.UserGroupInformation.getCurrentUser;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * Test IO through the stages.
+ * This mimics the workflow of a job with two tasks:
+ * the first task has two attempts, with the second attempt
+ * committed after the first (simulating the
+ * failure-during-task-commit scenario which the v2 algorithm
+ * cannot handle).
+ * A sketch of the stage sequence is included at the top of the
+ * class body.
+ *
+ * The tests are ordered and the output dir is not cleaned up
+ * after each test case.
+ * The last test case MUST perform the cleanup.
+ */
+@FixMethodOrder(MethodSorters.NAME_ASCENDING)
+public class TestJobThroughManifestCommitter
+    extends AbstractManifestCommitterTest {
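+
+  /*
+   * Illustrative sketch only (not executed): assuming the stage classes
+   * behave as exercised by the ordered test cases below, a minimal
+   * single-task run through the protocol looks roughly like this:
+   *
+   *   StageConfig job = createStageConfigForJob(JOB1, destDir).build();
+   *   new SetupJobStage(job).apply(true);               // job attempt dir
+   *   StageConfig ta = createStageConfig(JOB1, TASK0, TA0, destDir).build();
+   *   new SetupTaskStage(ta).apply("first");            // task attempt dir
+   *   // ... the task writes its output under the task attempt dir ...
+   *   new CommitTaskStage(ta).apply(null);              // saves the task manifest
+   *   new CommitJobStage(job).apply(
+   *       new CommitJobStage.Arguments(true, false, null, DISABLED));
+   *   new CleanupJobStage(job).apply(new CleanupJobStage.Arguments(
+   *       OP_STAGE_JOB_CLEANUP, true, true, false));
+   *
+   * The test cases below split this sequence up, add a second task and
+   * duplicate task attempts, and verify the state of the store after
+   * each stage.
+   */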
+
+  /** Destination directory. */
+  private Path destDir;
+
+  /** directory names for the tests. */
+  private ManifestCommitterSupport.AttemptDirectories dirs;
+
+  /**
+   * To ensure that the local FS has a shared root path, this is static.
+   */
+  @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized")
+  private static Path sharedTestRoot = null;
+
+  /**
+   * Job ID.
+   */
+  private String jobId;
+
+  /**
+   * Task 0 attempt 0 ID.
+   */
+  private String taskAttempt00;
+
+  /**
+   * Task 0 attempt 1 ID.
+   */
+  private String taskAttempt01;
+
+  /**
+   * Task 1 attempt 0 ID.
+   */
+  private String taskAttempt10;
+
+  /**
+   * Task 1 attempt 1 ID.
+   */
+  private String taskAttempt11;
+
+  /**
+   * Stage config for TA00.
+   */
+  private StageConfig ta00Config;
+
+  /**
+   * Stage config for TA01.
+   */
+  private StageConfig ta01Config;
+
+  /**
+   * Stage config for TA10.
+   */
+  private StageConfig ta10Config;
+
+  /**
+   * Stage config for TA11.
+   */
+  private StageConfig ta11Config;
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    taskAttempt00 = TASK_IDS.getTaskAttempt(TASK0, TA0);
+    taskAttempt01 = TASK_IDS.getTaskAttempt(TASK0, TA1);
+    taskAttempt10 = TASK_IDS.getTaskAttempt(TASK1, TA0);
+    taskAttempt11 = TASK_IDS.getTaskAttempt(TASK1, TA1);
+    setSharedPath(path("TestJobThroughManifestCommitter"));
+    // add a dir with a space in.
+    destDir = new Path(sharedTestRoot, "out put");
+    jobId = TASK_IDS.getJobId();
+    // then the specific path underneath that for the attempt.
+    dirs = new ManifestCommitterSupport.AttemptDirectories(destDir,
+        jobId, 1);
+
+    // config for job attempt 1, task 00
+    setJobStageConfig(createStageConfigForJob(JOB1, destDir).build());
+    ta00Config = createStageConfig(JOB1, TASK0, TA0, destDir).build();
+    ta01Config = createStageConfig(JOB1, TASK0, TA1, destDir).build();
+    ta10Config = createStageConfig(JOB1, TASK1, TA0, destDir).build();
+    ta11Config = createStageConfig(JOB1, TASK1, TA1, destDir).build();
+  }
+
+  /**
+   * Test dir deletion is disabled in test case teardown so that
+   * subsequent tests can see the output of earlier ones.
+   * @throws IOException failure
+   */
+  @Override
+  protected void deleteTestDirInTeardown() throws IOException {
+    /* no-op */
+  }
+
+  /**
+   * Override point: something to turn on/off when exploring what
+   * manifests look like. Test runs against stores where storage is
+   * billed MUST enable this.
+   * @return true if, at the end of the run, the test dir should be deleted.
+   */
+  protected boolean shouldDeleteTestRootAtEndOfTestRun() {
+    return false;
+  }
+
+  /**
+   * Invoke this to clean up the test directories.
+   */
+  private void deleteSharedTestRoot() throws IOException {
+    describe("Deleting shared test root %s", sharedTestRoot);
+
+    rm(getFileSystem(), sharedTestRoot, true, false);
+  }
+
+  /**
+   * Set the shared test root if not already set.
+   * @param path path to set.
+   * @return true if the path was set
+   */
+  private static synchronized boolean setSharedPath(final Path path) {
+    if (sharedTestRoot == null) {
+      // set this as needed
+      LOG.info("Set shared path to {}", path);
+      sharedTestRoot = path;
+      return true;
+    }
+    return false;
+  }
+
+  @Test
+  public void test_0000_setupTestDir() throws Throwable {
+    describe("always ensure directory setup is empty");
+    deleteSharedTestRoot();
+  }
+
+  @Test
+  public void test_0100_setupJobStage() throws Throwable {
+    describe("Set up a job");
+    verifyPath("Job attempt dir",
+        dirs.getJobAttemptDir(),
+        new SetupJobStage(getJobStageConfig()).apply(true));
+  }
+
+  /**
+   * Check that the job setup stage worked.
+   * @throws IOException failure.
+   */
+  private void verifyJobSetupCompleted() throws IOException {
+    assertPathExists("Job attempt dir from test_0100", dirs.getJobAttemptDir());
+  }
+
+  @Test
+  public void test_0110_setupJobOnlyAllowedOnce() throws Throwable {
+    describe("a second creation of a job attempt must fail");
+    verifyJobSetupCompleted();
+    intercept(FileAlreadyExistsException.class, "", () ->
+        new SetupJobStage(getJobStageConfig()).apply(true));
+    // job is still there
+    assertPathExists("Job attempt dir", dirs.getJobAttemptDir());
+  }
+
+  @Test
+  public void test_0120_setupJobNewAttemptNumber() throws Throwable {
+    describe("Creating a new job attempt is supported");
+    verifyJobSetupCompleted();
+    Path path = pathMustExist("Job attempt 2 dir",
+        new SetupJobStage(createStageConfig(2, -1, 0, destDir))
+            .apply(false));
+    Assertions.assertThat(path)
+        .describedAs("Stage created path")
+        .isNotEqualTo(dirs.getJobAttemptDir());
+  }
+
+  @Test
+  public void test_0200_setupTask00() throws Throwable {
+    describe("Set up a task; job must have been set up first");
+    verifyJobSetupCompleted();
+    verifyPath("Task attempt 00",
+        dirs.getTaskAttemptPath(taskAttempt00),
+        new SetupTaskStage(ta00Config).apply("first"));
+  }
+
+  /**
+   * Verify TA00 is set up.
+   */
+  private void verifyTaskAttempt00SetUp() throws IOException {
+    pathMustExist("Dir from taskAttempt00 setup",
+        dirs.getTaskAttemptPath(taskAttempt00));
+  }
+
+  @Test
+  public void test_0210_setupTask00OnlyAllowedOnce() throws Throwable {
+    describe("Second attempt to set up task00 must fail.");
+    verifyTaskAttempt00SetUp();
+    intercept(FileAlreadyExistsException.class, "second", () ->
+        new SetupTaskStage(ta00Config).apply("second"));
+  }
+
+  @Test
+  public void test_0220_setupTask01() throws Throwable {
+    describe("Setup task attempt 01");
+    verifyTaskAttempt00SetUp();
+    verifyPath("Task attempt 01",
+        dirs.getTaskAttemptPath(taskAttempt01),
+        new SetupTaskStage(ta01Config)
+            .apply("01"));
+  }
+
+  @Test
+  public void test_0230_setupTask10() throws Throwable {
+    describe("Setup task attempt 10");
+    verifyJobSetupCompleted();
+    verifyPath("Task attempt 10",
+        dirs.getTaskAttemptPath(taskAttempt10),
+        new SetupTaskStage(ta10Config)
+            .apply("10"));
+  }
+
+  /**
+   * Setup then abort task 11 before creating any files;
+   * verify that commit fails before creating a manifest file.
+   */
+  @Test
+  public void test_0240_setupThenAbortTask11() throws Throwable {
+    describe("Setup then abort task attempt 11");
+    verifyJobSetupCompleted();
+    Path ta11Path = new SetupTaskStage(ta11Config).apply("11");
+    Path deletedDir = new AbortTaskStage(ta11Config).apply(false);
+    Assertions.assertThat(ta11Path)
+        .isEqualTo(deletedDir);
+    assertPathDoesNotExist("aborted directory", ta11Path);
+    // execute will fail as there's no dir to list.
+    intercept(FileNotFoundException.class, () ->
+        new CommitTaskStage(ta11Config).apply(null));
+    assertPathDoesNotExist("task manifest",
+        manifestPathForTask(dirs.getTaskManifestDir(),
+            TASK_IDS.getTaskId(TASK1)));
+  }
+
+  /**
+   * Execute TA00 by generating a lot of files in its directory,
+   * then committing the task attempt.
+   * The manifest at the task path (i.e. the record of which attempt's
+   * output is to be used) MUST now have been generated by this TA.
+   */
+  @Test
+  public void test_0300_executeTask00() throws Throwable {
+    describe("Create the files for Task 00, then commit the task");
+    List<Path> files = createFilesOrDirs(dirs.getTaskAttemptPath(taskAttempt00),
+        "part-00", getExecutorService(),
+        DEPTH, WIDTH, FILES_PER_DIRECTORY, false);
+    // saves the task manifest to the job dir
+    CommitTaskStage.Result result = new CommitTaskStage(ta00Config)
+        .apply(null);
+    verifyPathExists(getFileSystem(), "manifest",
+        result.getPath());
+
+    TaskManifest manifest = result.getTaskManifest();
+    manifest.validate();
+    // clear the IOStats to reduce the size of the printed JSON.
+    manifest.setIOStatistics(null);
+    LOG.info("Task Manifest {}", manifest.toJson());
+    validateTaskAttemptManifest(this.taskAttempt00, files, manifest);
+  }
+
+  /**
+   * Validate the manifest of a task attempt.
+   * @param attemptId attempt ID
+   * @param files files which were created.
+   * @param manifest manifest
+   * @throws IOException IO problem
+   */
+  protected void validateTaskAttemptManifest(
+      String attemptId,
+      List<Path> files,
+      TaskManifest manifest) throws IOException {
+
+    verifyManifestTaskAttemptID(manifest, attemptId);
+
+    // validate the manifest
+    verifyManifestFilesMatch(manifest, files);
+  }
+
+  /**
+   * Execute TA01 by generating a lot of files in its directory
+   * then committing the task attempt.
+   * The manifest at the task path (i.e. the record of which attempt's
+   * output is to be used) MUST now have been generated by this TA.
+   * Any existing manifest will have been overwritten.
+   */
+  @Test
+  public void test_0310_executeTask01() throws Throwable {
+    describe("Create the files for Task 01, then commit the task");
+    List<Path> files = createFilesOrDirs(dirs.getTaskAttemptPath(taskAttempt01),
+        "part-00", getExecutorService(),
+        DEPTH, WIDTH, FILES_PER_DIRECTORY, false);
+    // saves the task manifest to the job dir
+    CommitTaskStage.Result result = new CommitTaskStage(ta01Config)
+        .apply(null);
+    Path manifestPath = verifyPathExists(getFileSystem(), "manifest",
+        result.getPath()).getPath();
+
+    // load the manifest from the FS, not the return value,
+    // so we can verify that last task to commit wins.
+    TaskManifest manifest = TaskManifest.load(getFileSystem(), manifestPath);
+    manifest.validate();
+    // clear the IOStats to reduce the size of the printed JSON.
+    manifest.setIOStatistics(null);
+    LOG.info("Task Manifest {}", manifest.toJson());
+
+    validateTaskAttemptManifest(taskAttempt01, files, manifest);
+
+  }
+
+  /**
+   * Second task writes to more directories, but fewer files per dir.
+   * This ensures that there will be dirs here which aren't in the
+   * first task's output.
+   */
+  @Test
+  public void test_0320_executeTask10() throws Throwable {
+    describe("Create the files for Task 10, then commit the task");
+    List<Path> files = createFilesOrDirs(
+        dirs.getTaskAttemptPath(ta10Config.getTaskAttemptId()),
+        "part-01", getExecutorService(),
+        DEPTH, WIDTH + 1, FILES_PER_DIRECTORY - 1, false);
+    // saves the task manifest to the job dir
+    CommitTaskStage.Result result = new CommitTaskStage(ta10Config)
+        .apply(null);
+    TaskManifest manifest = result.getTaskManifest();
+    validateTaskAttemptManifest(taskAttempt10, files, manifest);
+  }
+
+  @Test
+  public void test_0340_setupThenAbortTask11() throws Throwable {
+    describe("Setup then abort task attempt 11");
+    Path ta11Path = new SetupTaskStage(ta11Config).apply("11");
+    createFilesOrDirs(
+        ta11Path,
+        "part-01", getExecutorService(),
+        2, 1, 1, false);
+
+    new AbortTaskStage(ta11Config).apply(false);
+    assertPathDoesNotExist("aborted directory", ta11Path);
+    // execute will fail as there's no dir to list.
+    intercept(FileNotFoundException.class, () ->
+        new CommitTaskStage(ta11Config).apply(null));
+
+    // and the manifest MUST be unchanged from the previous stage
+    Path manifestPathForTask1 = manifestPathForTask(dirs.getTaskManifestDir(),
+        TASK_IDS.getTaskId(TASK1));
+    verifyManifestTaskAttemptID(
+        TaskManifest.load(getFileSystem(), manifestPathForTask1),
+        taskAttempt10);
+
+  }
+
+  /**
+   * Load all the committed manifests, which must be TA01 (last of
+   * task 0 to commit) and TA10.
+   */
+  @Test
+  public void test_0400_loadManifests() throws Throwable {
+    describe("Load all manifests; committed must be TA01 and TA10");
+    LoadManifestsStage.Result result
+        = new LoadManifestsStage(getJobStageConfig()).apply(true);
+    String summary = result.getSummary().toString();
+    LOG.info("Manifest summary {}", summary);
+    List<TaskManifest> manifests = result.getManifests();
+    Assertions.assertThat(manifests)
+        .describedAs("Loaded manifests in %s", summary)
+        .hasSize(2);
+    Map<String, TaskManifest> manifestMap = toMap(manifests);
+    verifyManifestTaskAttemptID(
+        manifestMap.get(taskAttempt01), taskAttempt01);
+    verifyManifestTaskAttemptID(
+        manifestMap.get(taskAttempt10), taskAttempt10);
+  }
+
+  @Test
+  public void test_0410_commitJob() throws Throwable {
+    describe("Commit the job");
+    CommitJobStage stage = new CommitJobStage(getJobStageConfig());
+    stage.apply(new CommitJobStage.Arguments(true, false, null, DISABLED));
+  }
+
+  /**
+   * Validate that the job output is good by invoking the
+   * {@link ValidateRenamedFilesStage} stage to
+   * validate all the manifests.
+   */
+  @Test
+  public void test_0420_validateJob() throws Throwable {
+    describe("Validate the output of the job through the validation"
+        + " stage");
+
+    // load in the success data.
+    ManifestSuccessData successData = loadAndPrintSuccessData(
+        getFileSystem(),
+        getJobStageConfig().getJobSuccessMarkerPath());
+
+    // load manifests stage will load all the task manifests again
+    List<TaskManifest> manifests = new LoadManifestsStage(getJobStageConfig())
+        .apply(true).getManifests();
+    // Now verify their files exist, returning the list of renamed files.
+    List<String> committedFiles = new ValidateRenamedFilesStage(getJobStageConfig())
+        .apply(manifests)
+        .stream().map(FileEntry::getDest)
+        .collect(Collectors.toList());
+
+    // verify that the list of committed files also matches
+    // that in the _SUCCESS file.
+    // note: there's a limit on the number of files listed in the
+    // _SUCCESS file, to stop writing it from slowing down jobs;
+    // therefore we don't make a simple "all must match" check.
+    Assertions.assertThat(committedFiles)
+        .containsAll(successData.getFilenames());
+
+    // now patch one of the manifest files by editing an entry
+    FileEntry entry = manifests.get(0).getFilesToCommit().get(0);
+    // no longer exists.
+    String oldName = entry.getDest();
+    String newName = oldName + ".missing";
+    entry.setDest(newName);
+
+    // validation will now fail
+    intercept(OutputValidationException.class, ".missing", () ->
+        new ValidateRenamedFilesStage(getJobStageConfig())
+            .apply(manifests));
+
+    // restore the name, but change the size
+    entry.setDest(oldName);
+    entry.setSize(128_000_000);
+    intercept(OutputValidationException.class, () ->
+        new ValidateRenamedFilesStage(getJobStageConfig())
+            .apply(manifests));
+  }
+
+  @Test
+  public void test_0430_validateStatistics() throws Throwable {
+    // load in the success data.
+    ManifestSuccessData successData = ManifestSuccessData.load(
+        getFileSystem(),
+        getJobStageConfig().getJobSuccessMarkerPath());
+    String json = successData.toJson();
+    LOG.info("Success data is {}", json);
+    Assertions.assertThat(successData)
+        .describedAs("Manifest " + json)
+        .returns(NetUtils.getLocalHostname(),
+            ManifestSuccessData::getHostname)
+        .returns(MANIFEST_COMMITTER_CLASSNAME,
+            ManifestSuccessData::getCommitter)
+        .returns(jobId,
+            ManifestSuccessData::getJobId)
+        .returns(true,
+            ManifestSuccessData::getSuccess)
+        .returns(JOB_ID_SOURCE_MAPREDUCE,
+            ManifestSuccessData::getJobIdSource);
+    // diagnostics
+    Assertions.assertThat(successData.getDiagnostics())
+        .containsEntry(PRINCIPAL,
+            getCurrentUser().getShortUserName())
+        .containsEntry(STAGE, OP_STAGE_JOB_COMMIT);
+
+    // and stats
+    IOStatisticsSnapshot iostats = successData.getIOStatistics();
+
+    int files = successData.getFilenames().size();
+    verifyStatisticCounterValue(iostats,
+        OP_STAGE_JOB_COMMIT, 1);
+    assertThatStatisticCounter(iostats,
+        COMMITTER_FILES_COMMITTED_COUNT)
+        .isGreaterThanOrEqualTo(files);
+    Long totalFiles = iostats.counters().get(COMMITTER_FILES_COMMITTED_COUNT);
+    verifyStatisticCounterValue(iostats,
+        COMMITTER_BYTES_COMMITTED_COUNT, totalFiles * 2);
+  }
+
+  @Test
+  public void test_440_validateSuccessFiles() throws Throwable {
+
+    // load in the success data.
+    final FileSystem fs = getFileSystem();
+    ManifestSuccessData successData = loadAndPrintSuccessData(
+        fs,
+        getJobStageConfig().getJobSuccessMarkerPath());
+    validateGeneratedFiles(fs,
+        getJobStageConfig().getDestinationDir(),
+        successData, false);
+  }
+
+  @Test
+  public void test_0900_cleanupJob() throws Throwable {
+    describe("Cleanup job");
+    CleanupJobStage.Arguments arguments = new CleanupJobStage.Arguments(
+        OP_STAGE_JOB_CLEANUP, true, true, false);
+    // the first run will list the three task attempt dirs and delete each
+    // one before the toplevel dir.
+    CleanupJobStage.Result result = new CleanupJobStage(
+        getJobStageConfig()).apply(arguments);
+    assertCleanupResult(result, CleanupJobStage.Outcome.PARALLEL_DELETE, 1 + 3);
+    assertPathDoesNotExist("Job attempt dir", result.getDirectory());
+
+    // not an error if we retry and the dir isn't there
+    result = new CleanupJobStage(getJobStageConfig()).apply(arguments);
+    assertCleanupResult(result, CleanupJobStage.Outcome.NOTHING_TO_CLEAN_UP, 0);
+  }
+
+  /**
+   * Needed to clean up the shared test root, as test case teardown
+   * does not do it.
+   */
+  //@Test
+  public void test_9999_cleanupTestDir() throws Throwable {
+    if (shouldDeleteTestRootAtEndOfTestRun()) {
+      deleteSharedTestRoot();
+    }
+  }
+
+}

+ 141 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestLoadManifestsStage.java

@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CreateOutputDirectoriesStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.LoadManifestsStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupJobStage;
+
+/**
+ * Test loading manifests from a store.
+ * By not creating files we can simulate a large job just by
+ * creating the manifests.
+ * The SaveTaskManifestStage is used for the save operation;
+ * this does a save + rename.
+ * For better test performance against a remote store, a thread
+ * pool is used to save the manifests in parallel.
+ */
+public class TestLoadManifestsStage extends AbstractManifestCommitterTest {
+
+  private int taskAttemptCount;
+
+  /**
+   * How many task attempts to make?
+   * Override point.
+   * @return a number greater than 0.
+   */
+  protected int numberOfTaskAttempts() {
+    return ManifestCommitterTestSupport.NUMBER_OF_TASK_ATTEMPTS;
+  }
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    taskAttemptCount = numberOfTaskAttempts();
+    Assertions.assertThat(taskAttemptCount)
+        .describedAs("Task attempt count")
+        .isGreaterThan(0);
+  }
+
+  /**
+   * Build a large number of manifests, but without the real files
+   * and directories.
+   * Save the manifests under the job attempt dir, then load
+   * them via the {@link LoadManifestsStage}.
+   * The directory preparation process is then executed after this.
+   * Because we know each task attempt creates the same number of directories,
+   * they will all be merged and so only a limited number of output dirs
+   * will be created.
+   */
+  @Test
+  public void testSaveThenLoadManyManifests() throws Throwable {
+
+    describe("Creating many manifests with fake file/dir entries,"
+        + " load them and prepare the output dirs.");
+
+    int filesPerTaskAttempt = 10;
+    LOG.info("Number of task attempts: {}, files per task attempt {}",
+        taskAttemptCount, filesPerTaskAttempt);
+
+    setJobStageConfig(createStageConfigForJob(JOB1, getDestDir()));
+
+    // set up the job.
+    new SetupJobStage(getJobStageConfig()).apply(false);
+
+    LOG.info("Creating manifest files for {}", taskAttemptCount);
+
+    executeTaskAttempts(taskAttemptCount, filesPerTaskAttempt);
+
+    LOG.info("Loading in the manifests");
+
+    // Load in the manifests
+    LoadManifestsStage stage = new LoadManifestsStage(
+        getJobStageConfig());
+
+    LoadManifestsStage.Result result = stage.apply(true);
+    LoadManifestsStage.SummaryInfo summary = result.getSummary();
+    List<TaskManifest> loadedManifests = result.getManifests();
+
+    Assertions.assertThat(summary.getManifestCount())
+        .describedAs("Manifest count of  %s", summary)
+        .isEqualTo(taskAttemptCount);
+    Assertions.assertThat(summary.getFileCount())
+        .describedAs("File count of  %s", summary)
+        .isEqualTo(taskAttemptCount * (long) filesPerTaskAttempt);
+    Assertions.assertThat(summary.getTotalFileSize())
+        .describedAs("File Size of  %s", summary)
+        .isEqualTo(getTotalDataSize());
+
+    // now verify that the manifest task IDs match the expected task IDs.
+    List<String> manifestTaskIds = loadedManifests.stream()
+        .map(TaskManifest::getTaskID)
+        .collect(Collectors.toList());
+    Assertions.assertThat(getTaskIds())
+        .describedAs("Task IDs of all tasks")
+        .containsExactlyInAnyOrderElementsOf(manifestTaskIds);
+
+    // now let's see about aggregating a large set of directories
+    Set<Path> createdDirectories = new CreateOutputDirectoriesStage(
+        getJobStageConfig())
+        .apply(loadedManifests)
+        .getCreatedDirectories();
+
+    // but after the merge process, only one directory per generated
+    // file output dir exists, regardless of the number of task attempts
+    Assertions.assertThat(createdDirectories)
+        .describedAs("Directories created")
+        .hasSize(filesPerTaskAttempt);
+
+    // and skipping the rename stage (which is going to fail),
+    // go straight to cleanup
+    new CleanupJobStage(getJobStageConfig()).apply(
+        new CleanupJobStage.Arguments("", true, true, false));
+  }
+
+}

+ 1801 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestManifestCommitProtocol.java

@@ -0,0 +1,1801 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+
+import org.assertj.core.api.Assertions;
+import org.junit.AfterClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.contract.ContractTestUtils;
+import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
+import org.apache.hadoop.fs.statistics.IOStatisticsSupport;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobStatus;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.task.JobContextImpl;
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+import org.apache.hadoop.util.DurationInfo;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.concurrent.HadoopExecutors;
+import org.apache.hadoop.util.functional.RemoteIterators;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.fs.contract.ContractTestUtils.listChildren;
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsSourceToString;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory.COMMITTER_FACTORY_CLASS;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_COMMITTER_FACTORY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SPARK_WRITE_UUID;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_BYTES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_FILES_COMMITTED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASKS_COMPLETED_COUNT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_LOAD_MANIFEST;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_ABORT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.createJobSummaryFilename;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.randomJobId;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.validateSuccessFile;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DiagnosticKeys.STAGE;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * This is a contract test for the commit protocol on a target filesystem.
+ * It is subclassed in the ABFS integration tests and elsewhere; a sketch
+ * of how such a subclass binds in is included at the top of the class body.
+ * Derived from the S3A protocol suite, which was itself based on
+ * the test suite {@code TestFileOutputCommitter}.
+ *
+ * Some of the methods trigger Java warnings about unchecked casts;
+ * these cannot be avoided, so the warnings are suppressed.
+ */
+@SuppressWarnings("unchecked")
+public class TestManifestCommitProtocol
+    extends AbstractManifestCommitterTest {
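+
+  /*
+   * Illustrative sketch only; the subclass name here is hypothetical and
+   * not part of this patch. A store-specific integration suite would
+   * typically extend this class and override the naming/creation hooks
+   * defined below, e.g.
+   *
+   *   public class ITestStoreManifestCommitProtocol
+   *       extends TestManifestCommitProtocol {
+   *     @Override
+   *     protected String suitename() {
+   *       return "ITestStoreManifestCommitProtocol";
+   *     }
+   *   }
+   *
+   * Paths stay isolated through getMethodName(), and the protocol test
+   * cases then run unchanged against the target filesystem.
+   */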
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(TestManifestCommitProtocol.class);
+
+  private static final String SUB_DIR = "SUB_DIR";
+
+  /**
+   * Part of the name of the output of task attempt 0.
+   */
+  protected static final String PART_00000 = "part-m-00000";
+
+  private static final Text KEY_1 = new Text("key1");
+
+  private static final Text KEY_2 = new Text("key2");
+
+  private static final Text VAL_1 = new Text("val1");
+
+  private static final Text VAL_2 = new Text("val2");
+
+  /**
+   * Snapshot of stats, which will be collected from
+   * committers.
+   */
+  private static final IOStatisticsSnapshot IOSTATISTICS =
+      IOStatisticsSupport.snapshotIOStatistics();
+
+  /**
+   * Job ID for jobs.
+   */
+  private final String jobId;
+
+  /**
+   * A random task attempt id for testing.
+   */
+  private final String attempt0;
+
+  /**
+   *  Attempt 0's task attempt ID.
+   */
+  private final TaskAttemptID taskAttempt0;
+
+  /**
+   * TA 1.
+   */
+  private final TaskAttemptID taskAttempt1;
+
+  /**
+   * Attempt 1 string value.
+   */
+  private final String attempt1;
+
+
+  /** Jobs to abort in test case teardown. */
+  private final List<JobData> abortInTeardown = new ArrayList<>(1);
+
+  /**
+   * Output directory.
+   * This is the directory into which output goes;
+   * all the job files go in _temporary underneath.
+   */
+  private Path outputDir;
+
+  /**
+   * Committer factory which calls back into
+   * {@link #createCommitter(Path, TaskAttemptContext)}.
+   */
+  private final LocalCommitterFactory
+      localCommitterFactory = new LocalCommitterFactory();
+
+  /**
+   * Clean up the output dir. No-op if
+   * {@link #outputDir} is null.
+   * @throws IOException failure to delete
+   */
+  private void cleanupOutputDir() throws IOException {
+    if (outputDir != null) {
+      getFileSystem().delete(outputDir, true);
+    }
+  }
+
+  /**
+   * Constructor.
+   */
+  public TestManifestCommitProtocol() {
+    ManifestCommitterTestSupport.JobAndTaskIDsForTests taskIDs
+        = new ManifestCommitterTestSupport.JobAndTaskIDsForTests(2, 2);
+    jobId = taskIDs.getJobId();
+    attempt0 = taskIDs.getTaskAttempt(0, 0);
+    taskAttempt0 = taskIDs.getTaskAttemptIdType(0, 0);
+    attempt1 = taskIDs.getTaskAttempt(0, 1);
+    taskAttempt1 = taskIDs.getTaskAttemptIdType(0, 1);
+  }
+
+  /**
+   * This must return the name of a suite which is unique to the test.
+   * @return a string which must be unique and a valid path.
+   */
+  protected String suitename() {
+    return "TestManifestCommitProtocolLocalFS";
+  }
+
+  /**
+   * Get the log; can be overridden for test case log.
+   * @return a log.
+   */
+  public Logger log() {
+    return LOG;
+  }
+
+  /**
+   * Overridden method returns the suitename as well as the method name,
+   * so if more than one committer test is run in parallel, paths are
+   * isolated.
+   * @return a name for a method, unique across the suites and test cases.
+   */
+  @Override
+  protected String getMethodName() {
+    return suitename() + "-" + super.getMethodName();
+  }
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+
+    outputDir = path(getMethodName());
+    cleanupOutputDir();
+  }
+
+  @Override
+  public void teardown() throws Exception {
+    describe("teardown");
+    Thread.currentThread().setName("teardown");
+    for (JobData jobData : abortInTeardown) {
+      // stop the job
+      abortJobQuietly(jobData);
+      // and then get its statistics
+      IOSTATISTICS.aggregate(jobData.committer.getIOStatistics());
+    }
+    try {
+      cleanupOutputDir();
+    } catch (IOException e) {
+      log().info("Exception during cleanup", e);
+    }
+    super.teardown();
+  }
+
+  @AfterClass
+  public static void logAggregateIOStatistics() {
+    LOG.info("Final IOStatistics {}",
+        ioStatisticsToPrettyString(IOSTATISTICS));
+  }
+
+  /**
+   * Add the specified job to the current list of jobs to abort in teardown.
+   * @param jobData job data.
+   */
+  protected void abortInTeardown(JobData jobData) {
+    abortInTeardown.add(jobData);
+  }
+
+  @Override
+  protected Configuration createConfiguration() {
+    Configuration conf = super.createConfiguration();
+    bindCommitter(conf);
+    return conf;
+  }
+
+  /**
+   * Set job up to use the manifest committer.
+   * @param conf configuration to set up
+   */
+  protected void bindCommitter(Configuration conf) {
+    conf.set(COMMITTER_FACTORY_CLASS, MANIFEST_COMMITTER_FACTORY);
+  }
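+
+  // Note: this is the same binding a real job would apply before
+  // submission, i.e. setting COMMITTER_FACTORY_CLASS to
+  // MANIFEST_COMMITTER_FACTORY on the job configuration; the
+  // enableManifestCommitter() helper used in newJob() below is assumed
+  // to do the equivalent.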
+
+  /**
+   * Create a committer for a task.
+   * @param context task context
+   * @return new committer
+   * @throws IOException failure
+   */
+  protected ManifestCommitter createCommitter(
+      TaskAttemptContext context) throws IOException {
+    return createCommitter(getOutputDir(), context);
+  }
+
+  /**
+   * Create a committer for a task and a given output path.
+   * @param outputPath path
+   * @param context task context
+   * @return new committer
+   * @throws IOException failure
+   */
+  protected ManifestCommitter createCommitter(
+      Path outputPath,
+      TaskAttemptContext context) throws IOException {
+    return new ManifestCommitter(outputPath, context);
+  }
+
+  protected Path getOutputDir() {
+    return outputDir;
+  }
+
+  protected String getJobId() {
+    return jobId;
+  }
+
+  protected String getAttempt0() {
+    return attempt0;
+  }
+
+  protected TaskAttemptID getTaskAttempt0() {
+    return taskAttempt0;
+  }
+
+  protected String getAttempt1() {
+    return attempt1;
+  }
+
+  protected TaskAttemptID getTaskAttempt1() {
+    return taskAttempt1;
+  }
+
+  /**
+   * Functional interface for creating committers, designed to allow
+   * different factories to be used to create different failure modes;
+   * see the illustrative sketch after this interface.
+   */
+  @FunctionalInterface
+  public interface CommitterFactory {
+
+    /**
+     * Create a committer for a task.
+     * @param context task context
+     * @return new committer
+     * @throws IOException failure
+     */
+    ManifestCommitter createCommitter(
+        TaskAttemptContext context) throws IOException;
+  }
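+
+  /*
+   * Illustrative sketch only: because CommitterFactory is a functional
+   * interface, a failure-injecting factory can be passed straight to
+   * startJob(); FaultyCommitterForTests is hypothetical and not part of
+   * this patch:
+   *
+   *   JobData jobData = startJob(
+   *       context -> new FaultyCommitterForTests(getOutputDir(), context),
+   *       true);
+   *
+   * This suite itself only uses the LocalCommitterFactory below.
+   */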
+
+  /**
+   * The normal committer creation factory, uses the abstract methods
+   * in the class.
+   */
+  protected class LocalCommitterFactory implements CommitterFactory {
+
+    @Override
+    public ManifestCommitter createCommitter(TaskAttemptContext context)
+        throws IOException {
+      return TestManifestCommitProtocol.this
+          .createCommitter(context);
+    }
+  }
+
+  /**
+   * Assert that for a given output, the job context returns a manifest
+   * committer factory. This is what FileOutputFormat does internally,
+   * and is needed to make sure that the relevant settings are being passed
+   * around.
+   * @param context job/task context
+   * @param output destination path.
+   */
+  protected void assertCommitterFactoryIsManifestCommitter(
+      JobContext context, Path output) {
+
+    final Configuration conf = context.getConfiguration();
+    // check one: committer
+    assertConfigurationUsesManifestCommitter(conf);
+    final String factoryName = conf.get(COMMITTER_FACTORY_CLASS, "");
+    final PathOutputCommitterFactory factory
+        = PathOutputCommitterFactory.getCommitterFactory(
+        output,
+        conf);
+    Assertions.assertThat(factory)
+        .describedAs("Committer for output path %s"
+                + " and factory name \"%s\"",
+            output, factoryName)
+        .isInstanceOf(ManifestCommitterFactory.class);
+  }
+
+  /**
+   * This is to debug situations where the test committer factory
+   * on tasks was binding to FileOutputCommitter even when
+   * tests were overriding it.
+   * @param conf configuration to probe.
+   */
+  private void assertConfigurationUsesManifestCommitter(
+      Configuration conf) {
+    final String factoryName = conf.get(COMMITTER_FACTORY_CLASS, null);
+    Assertions.assertThat(factoryName)
+        .describedAs("Value of %s", COMMITTER_FACTORY_CLASS)
+        .isEqualTo(MANIFEST_COMMITTER_FACTORY);
+  }
+
+  /**
+   * Write some text out.
+   * @param context task
+   * @throws IOException IO failure
+   * @throws InterruptedException write interrupted
+   * @return the path written to
+   */
+  protected Path writeTextOutput(TaskAttemptContext context)
+      throws IOException, InterruptedException {
+    describe("write output");
+    try (DurationInfo d = new DurationInfo(LOG,
+        "Writing Text output for task %s", context.getTaskAttemptID())) {
+      TextOutputForTests.LoggingLineRecordWriter<Writable, Object> writer
+          = new TextOutputForTests<Writable, Object>().getRecordWriter(context);
+      writeOutput(writer, context);
+      return writer.getDest();
+    }
+  }
+
+  /**
+   * Write the standard output.
+   * @param writer record writer
+   * @param context task context
+   * @throws IOException IO failure
+   * @throws InterruptedException write interrupted
+   */
+  private void writeOutput(
+      RecordWriter<Writable, Object> writer,
+      TaskAttemptContext context) throws IOException, InterruptedException {
+    NullWritable nullWritable = NullWritable.get();
+    try (ManifestCommitterTestSupport.CloseWriter<Writable, Object> cw =
+             new ManifestCommitterTestSupport.CloseWriter<>(writer, context)) {
+      writer.write(KEY_1, VAL_1);
+      writer.write(null, nullWritable);
+      writer.write(null, VAL_1);
+      writer.write(nullWritable, VAL_2);
+      writer.write(KEY_2, nullWritable);
+      writer.write(KEY_1, null);
+      writer.write(null, null);
+      writer.write(KEY_2, VAL_2);
+      writer.close(context);
+    }
+  }
+
+  /**
+   * Write the output of a map.
+   * @param writer record writer
+   * @param context task context
+   * @throws IOException IO failure
+   * @throws InterruptedException write interrupted
+   */
+  private void writeMapFileOutput(RecordWriter<WritableComparable<?>, Writable> writer,
+      TaskAttemptContext context) throws IOException, InterruptedException {
+    describe("\nWrite map output");
+    try (DurationInfo d = new DurationInfo(LOG,
+        "Writing Text output for task %s", context.getTaskAttemptID());
+         ManifestCommitterTestSupport.CloseWriter<WritableComparable<?>, Writable> cw =
+             new ManifestCommitterTestSupport.CloseWriter<>(writer, context)) {
+      for (int i = 0; i < 10; ++i) {
+        Text val = ((i & 1) == 1) ? VAL_1 : VAL_2;
+        writer.write(new LongWritable(i), val);
+      }
+      LOG.debug("Closing writer {}", writer);
+      writer.close(context);
+    }
+  }
+
+  /**
+   * Details on a job for use in {@code startJob} and elsewhere.
+   */
+  protected static final class JobData {
+
+    private final Job job;
+
+    private final JobContext jContext;
+
+    private final TaskAttemptContext tContext;
+
+    private final ManifestCommitter committer;
+
+    private final Configuration conf;
+
+    private Path writtenTextPath; // null if not written to
+
+    public JobData(Job job,
+        JobContext jContext,
+        TaskAttemptContext tContext,
+        ManifestCommitter committer) {
+      this.job = job;
+      this.jContext = jContext;
+      this.tContext = tContext;
+      this.committer = committer;
+      conf = job.getConfiguration();
+    }
+
+    public String jobId() {
+      return committer.getJobUniqueId();
+    }
+  }
+
+  /**
+   * Create a new job. Sets the task attempt ID,
+   * and output dir; asks for a success marker.
+   * @return the new job
+   * @throws IOException failure
+   */
+  public Job newJob() throws IOException {
+    return newJob(outputDir, getConfiguration(), attempt0);
+  }
+
+  /**
+   * Create a new job. Sets the task attempt ID,
+   * and output dir; asks for a success marker.
+   * Committer factory is set to manifest factory, so is independent
+   * of FS schema.
+   * @param dir dest dir
+   * @param configuration config to get the job from
+   * @param taskAttemptId task attempt
+   * @return the new job
+   * @throws IOException failure
+   */
+  private Job newJob(Path dir, Configuration configuration,
+      String taskAttemptId) throws IOException {
+    Job job = Job.getInstance(configuration);
+    Configuration conf = job.getConfiguration();
+    conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskAttemptId);
+    enableManifestCommitter(conf);
+    FileOutputFormat.setOutputPath(job, dir);
+    return job;
+  }
+
+  /**
+   * Start a job with a committer; optionally write the test data.
+   * Always register the job to be aborted (quietly) in teardown.
+   * This is, from an "OO-purity" perspective, the wrong kind of method
+   * to have: it sets things up, mixes functionality and registers for
+   * teardown. Its aim is simple though: a common body of code for
+   * starting work in test cases.
+   * @param writeText should the text be written?
+   * @return the job data 4-tuple
+   * @throws IOException IO problems
+   * @throws InterruptedException interruption during write
+   */
+  protected JobData startJob(boolean writeText)
+      throws IOException, InterruptedException {
+    return startJob(localCommitterFactory, writeText);
+  }
+
+  /**
+   * Start a job with a committer; optionally write the test data.
+   * Always register the job to be aborted (quietly) in teardown.
+   * This is, from an "OO-purity" perspective, the wrong kind of method
+   * to have: it sets things up, mixes functionality and registers for
+   * teardown. Its aim is simple though: a common body of code for
+   * starting work in test cases.
+   * @param factory the committer factory to use
+   * @param writeText should the text be written?
+   * @return the job data 4-tuple
+   * @throws IOException IO problems
+   * @throws InterruptedException interruption during write
+   */
+  protected JobData startJob(CommitterFactory factory, boolean writeText)
+      throws IOException, InterruptedException {
+    Job job = newJob();
+    Configuration conf = job.getConfiguration();
+    assertConfigurationUsesManifestCommitter(conf);
+    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt0);
+    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 1);
+    JobContext jContext = new JobContextImpl(conf, taskAttempt0.getJobID());
+    TaskAttemptContext tContext = new TaskAttemptContextImpl(conf,
+        taskAttempt0);
+    ManifestCommitter committer = factory.createCommitter(tContext);
+
+    // setup
+    JobData jobData = new JobData(job, jContext, tContext, committer);
+    setupJob(jobData);
+    abortInTeardown(jobData);
+
+    if (writeText) {
+      // write output
+      jobData.writtenTextPath = writeTextOutput(tContext);
+    }
+    return jobData;
+  }
+
+  /**
+   * Set up the job and task.
+   * @param jobData job data
+   * @throws IOException problems
+   */
+  protected void setupJob(JobData jobData) throws IOException {
+    ManifestCommitter committer = jobData.committer;
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    describe("\nsetup job");
+    try (DurationInfo d = new DurationInfo(LOG,
+        "setup job %s", jContext.getJobID())) {
+      committer.setupJob(jContext);
+    }
+    setupCommitter(committer, tContext);
+    describe("setup complete");
+  }
+
+  private void setupCommitter(
+      final ManifestCommitter committer,
+      final TaskAttemptContext tContext) throws IOException {
+    try (DurationInfo d = new DurationInfo(LOG,
+        "setup task %s", tContext.getTaskAttemptID())) {
+      committer.setupTask(tContext);
+    }
+  }
+
+  /**
+   * Abort a job quietly.
+   * @param jobData job info
+   */
+  protected void abortJobQuietly(JobData jobData) {
+    abortJobQuietly(jobData.committer, jobData.jContext, jobData.tContext);
+  }
+
+  /**
+   * Abort a job quietly: first task, then job.
+   * @param committer committer
+   * @param jContext job context
+   * @param tContext task context
+   */
+  protected void abortJobQuietly(ManifestCommitter committer,
+      JobContext jContext,
+      TaskAttemptContext tContext) {
+    describe("\naborting task");
+    try {
+      committer.abortTask(tContext);
+    } catch (Exception e) {
+      log().warn("Exception aborting task:", e);
+    }
+    describe("\naborting job");
+    try {
+      committer.abortJob(jContext, JobStatus.State.KILLED);
+    } catch (Exception e) {
+      log().warn("Exception aborting job", e);
+    }
+  }
+
+  /**
+   * Commit the task and then the job.
+   * @param committer committer
+   * @param jContext job context
+   * @param tContext task context
+   * @throws IOException problems
+   */
+  protected void commitTaskAndJob(ManifestCommitter committer,
+      JobContext jContext,
+      TaskAttemptContext tContext) throws IOException {
+    try (DurationInfo d = new DurationInfo(LOG,
+        "committing Job %s", jContext.getJobID())) {
+      describe("\ncommitting task");
+      committer.commitTask(tContext);
+      describe("\ncommitting job");
+      committer.commitJob(jContext);
+      describe("commit complete\n");
+    }
+  }
+
+  /**
+   * Execute work as part of a test, after creating the job.
+   * After the execution, {@link #abortJobQuietly(JobData)} is
+   * called for abort/cleanup.
+   * @param name name of work (for logging)
+   * @param action action to execute
+   * @throws Exception failure
+   */
+  protected void executeWork(String name, ActionToTest action)
+      throws Exception {
+    executeWork(name, startJob(false), action);
+  }
+
+  /**
+   * Execute work as part of a test, against the created job.
+   * After the execution, {@link #abortJobQuietly(JobData)} is
+   * called for abort/cleanup.
+   * @param name name of work (for logging)
+   * @param jobData job info
+   * @param action action to execute
+   * @throws Exception failure
+   */
+  public void executeWork(String name,
+      JobData jobData,
+      ActionToTest action) throws Exception {
+    try (DurationInfo d = new DurationInfo(LOG, "Executing %s", name)) {
+      action.exec(jobData.job,
+          jobData.jContext,
+          jobData.tContext,
+          jobData.committer);
+    } finally {
+      abortJobQuietly(jobData);
+    }
+  }
+
+  /**
+   * Load a manifest from the test FS.
+   * @param path path
+   * @return the manifest
+   * @throws IOException failure to load
+   */
+  TaskManifest loadManifest(Path path) throws IOException {
+    return TaskManifest.load(getFileSystem(), path);
+  }
+
+  /**
+   * Verify that recovery doesn't work for these committers.
+   */
+  @Test
+  @SuppressWarnings("deprecation")
+  public void testRecoveryAndCleanup() throws Exception {
+    describe("Test (unsupported) task recovery.");
+    JobData jobData = startJob(true);
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+
+    Assertions.assertThat(committer.getWorkPath())
+        .as("null workPath in committer " + committer)
+        .isNotNull();
+    Assertions.assertThat(committer.getOutputPath())
+        .as("null outputPath in committer " + committer)
+        .isNotNull();
+
+    // Commit the task.
+    commitTask(committer, tContext);
+
+    // load and log the manifest
+    final TaskManifest manifest = loadManifest(
+        committer.getTaskManifestPath(tContext));
+    LOG.info("Manifest {}", manifest);
+
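+    // simulate a second application attempt trying to recover the task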
+    Configuration conf2 = jobData.job.getConfiguration();
+    conf2.set(MRJobConfig.TASK_ATTEMPT_ID, attempt0);
+    conf2.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 2);
+    JobContext jContext2 = new JobContextImpl(conf2, taskAttempt0.getJobID());
+    TaskAttemptContext tContext2 = new TaskAttemptContextImpl(conf2,
+        taskAttempt0);
+    ManifestCommitter committer2 = createCommitter(tContext2);
+    committer2.setupJob(tContext2);
+
+    Assertions.assertThat(committer2.isRecoverySupported())
+        .as("recoverySupported in " + committer2)
+        .isFalse();
+    intercept(IOException.class, "recover",
+        () -> committer2.recoverTask(tContext2));
+
+    // at this point, task attempt 0 has failed to recover
+    // it should be abortable though. This will be a no-op as it already
+    // committed
+    describe("aborting task attempt 2; expect nothing to clean up");
+    committer2.abortTask(tContext2);
+    describe("Aborting job 2; expect pending commits to be aborted");
+    committer2.abortJob(jContext2, JobStatus.State.KILLED);
+  }
+
+  /**
+   * Assert that the task attempt filesystem doesn't have a task attempt
+   * directory.
+   * @param committer committer
+   * @param context task context
+   * @throws IOException IO failure.
+   */
+  protected void assertTaskAttemptPathDoesNotExist(
+      ManifestCommitter committer, TaskAttemptContext context)
+      throws IOException {
+    Path attemptPath = committer.getTaskAttemptPath(context);
+    ContractTestUtils.assertPathDoesNotExist(
+        attemptPath.getFileSystem(context.getConfiguration()),
+        "task attempt dir",
+        attemptPath);
+  }
+
+  protected void assertJobAttemptPathDoesNotExist(
+      ManifestCommitter committer, JobContext context)
+      throws IOException {
+    Path attemptPath = committer.getJobAttemptPath(context);
+    ContractTestUtils.assertPathDoesNotExist(
+        attemptPath.getFileSystem(context.getConfiguration()),
+        "job attempt dir",
+        attemptPath);
+  }
+
+  /**
+   * Verify the output of the directory.
+   * That includes the {@code part-m-00000-*}
+   * file existence and contents, as well as optionally, the success marker.
+   * @param dir directory to scan.
+   * @param expectSuccessMarker check the success marker?
+   * @param expectedJobId job ID, verified if non-empty and success data loaded
+   * @throws Exception failure.
+   * @return the success data
+   */
+  private ManifestSuccessData validateContent(Path dir,
+      boolean expectSuccessMarker,
+      String expectedJobId) throws Exception {
+    lsR(getFileSystem(), dir, true);
+    ManifestSuccessData successData;
+    if (expectSuccessMarker) {
+      successData = verifySuccessMarker(dir, expectedJobId);
+    } else {
+      successData = null;
+    }
+    Path expectedFile = getPart0000(dir);
+    log().debug("Validating content in {}", expectedFile);
+    StringBuilder expectedOutput = new StringBuilder();
+    expectedOutput.append(KEY_1).append('\t').append(VAL_1).append("\n");
+    expectedOutput.append(VAL_1).append("\n");
+    expectedOutput.append(VAL_2).append("\n");
+    expectedOutput.append(KEY_2).append("\n");
+    expectedOutput.append(KEY_1).append("\n");
+    expectedOutput.append(KEY_2).append('\t').append(VAL_2).append("\n");
+    String output = readFile(expectedFile);
+    Assertions.assertThat(output)
+        .describedAs("Content of %s", expectedFile)
+        .isEqualTo(expectedOutput.toString());
+    return successData;
+  }
+
+  /**
+   * Identify any path under the directory which begins with the
+   * {@code "part-m-00000"} sequence. There's some compensation for
+   * eventual consistency here.
+   * @param dir directory to scan
+   * @return the full path
+   * @throws FileNotFoundException the path is missing.
+   * @throws Exception failure.
+   */
+  protected Path getPart0000(final Path dir) throws Exception {
+    final FileSystem fs = dir.getFileSystem(getConfiguration());
+    FileStatus[] statuses = fs.listStatus(dir,
+        path -> path.getName().startsWith(PART_00000));
+    if (statuses.length != 1) {
+      // fail, with a listing of the parent dir
+      ContractTestUtils.assertPathExists(fs, "Output file",
+          new Path(dir, PART_00000));
+    }
+    return statuses[0].getPath();
+  }
+
+  /**
+   * Look for the part file subdirectory of the output dir
+   * and its index and data entries.
+   * @param fs filesystem
+   * @param dir output dir
+   * @throws Exception failure.
+   */
+  private void validateMapFileOutputContent(
+      FileSystem fs, Path dir) throws Exception {
+    // map output is a directory with index and data files
+    assertPathExists("Map output", dir);
+    Path expectedMapDir = getPart0000(dir);
+    assertPathExists("Map output", expectedMapDir);
+    assertIsDirectory(expectedMapDir);
+    FileStatus[] files = fs.listStatus(expectedMapDir);
+    Assertions.assertThat(files)
+        .as("No files found in " + expectedMapDir)
+        .isNotEmpty();
+    assertPathExists("index file in " + expectedMapDir,
+        new Path(expectedMapDir, MapFile.INDEX_FILE_NAME));
+    assertPathExists("data file in " + expectedMapDir,
+        new Path(expectedMapDir, MapFile.DATA_FILE_NAME));
+  }
+
+  /**
+   * Full test of the expected lifecycle: start job, task, write, commit task,
+   * commit job.
+   * @throws Exception on a failure
+   */
+  @Test
+  public void testCommitLifecycle() throws Exception {
+    describe("Full test of the expected lifecycle:\n" +
+        " start job, task, write, commit task, commit job.\n" +
+        "Verify:\n" +
+        "* no files are visible after task commit\n" +
+        "* the expected file is visible after job commit\n");
+    JobData jobData = startJob(false);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+    assertCommitterFactoryIsManifestCommitter(tContext,
+        tContext.getWorkingDirectory());
+    validateTaskAttemptWorkingDirectory(committer, tContext);
+
+    // write output
+    describe("1. Writing output");
+    final Path textOutputPath = writeTextOutput(tContext);
+    describe("Output written to %s", textOutputPath);
+
+    describe("2. Committing task");
+    Assertions.assertThat(committer.needsTaskCommit(tContext))
+        .as("No files to commit were found by " + committer)
+        .isTrue();
+    commitTask(committer, tContext);
+    final TaskManifest taskManifest = requireNonNull(
+        committer.getTaskAttemptCommittedManifest(), "committerTaskManifest");
+    final String manifestJSON = taskManifest.toJson();
+    LOG.info("Task manifest {}", manifestJSON);
+    int filesCreated = 1;
+    Assertions.assertThat(taskManifest.getFilesToCommit())
+        .describedAs("Files to commit in task manifest %s", manifestJSON)
+        .hasSize(filesCreated);
+    Assertions.assertThat(taskManifest.getDestDirectories())
+        .describedAs("Directories to create in task manifest %s",
+            manifestJSON)
+        .isEmpty();
+
+    // this is only task commit; there MUST be no part- files in the dest dir
+    try {
+      RemoteIterators.foreach(getFileSystem().listFiles(outputDir, false),
+          (status) ->
+              Assertions.assertThat(status.getPath().toString())
+                  .as("task committed file to dest :" + status)
+                  .contains("part"));
+    } catch (FileNotFoundException ignored) {
+      log().info("Outdir {} is not created by task commit phase ",
+          outputDir);
+    }
+
+    describe("3. Committing job");
+
+    commitJob(committer, jContext);
+
+    // validate output
+    describe("4. Validating content");
+    String jobUniqueId = jobData.jobId();
+    ManifestSuccessData successData = validateContent(outputDir,
+        true,
+        jobUniqueId);
+    // look in the SUMMARY
+    Assertions.assertThat(successData.getDiagnostics())
+        .describedAs("Stage entry in SUCCESS")
+        .containsEntry(STAGE, OP_STAGE_JOB_COMMIT);
+    IOStatisticsSnapshot jobStats = successData.getIOStatistics();
+    // manifest
+    verifyStatisticCounterValue(jobStats,
+        OP_LOAD_MANIFEST, 1);
+    FileStatus st = getFileSystem().getFileStatus(getPart0000(outputDir));
+    verifyStatisticCounterValue(jobStats,
+        COMMITTER_FILES_COMMITTED_COUNT, filesCreated);
+    verifyStatisticCounterValue(jobStats,
+        COMMITTER_BYTES_COMMITTED_COUNT, st.getLen());
+
+    // now load and examine the job report.
+    // this MUST contain all the stats of the summary, plus timings on
+    // job commit itself
+
+    ManifestSuccessData report = loadReport(jobUniqueId, true);
+    Map<String, String> diag = report.getDiagnostics();
+    Assertions.assertThat(diag)
+        .describedAs("Stage entry in report")
+        .containsEntry(STAGE, OP_STAGE_JOB_COMMIT);
+    IOStatisticsSnapshot reportStats = report.getIOStatistics();
+    verifyStatisticCounterValue(reportStats,
+        OP_LOAD_MANIFEST, 1);
+    verifyStatisticCounterValue(reportStats,
+        OP_STAGE_JOB_COMMIT, 1);
+    verifyStatisticCounterValue(reportStats,
+        COMMITTER_FILES_COMMITTED_COUNT, filesCreated);
+    verifyStatisticCounterValue(reportStats,
+        COMMITTER_BYTES_COMMITTED_COUNT, st.getLen());
+
+  }
+
+  /**
+   * Load a summary from the report dir.
+   * @param jobUniqueId job ID
+   * @param expectSuccess is the job expected to have succeeded.
+   * @throws IOException failure to load
+   * @return the report
+   */
+  private ManifestSuccessData loadReport(String jobUniqueId,
+      boolean expectSuccess) throws IOException {
+    File file = new File(getReportDir(),
+        createJobSummaryFilename(jobUniqueId));
+    ContractTestUtils.assertIsFile(FileSystem.getLocal(getConfiguration()),
+        new Path(file.toURI()));
+    ManifestSuccessData report = ManifestSuccessData.serializer().load(file);
+    LOG.info("Report for job {}:\n{}", jobUniqueId, report.toJson());
+    Assertions.assertThat(report.getSuccess())
+        .describedAs("success flag in report")
+        .isEqualTo(expectSuccess);
+    return report;
+  }
+
+  /**
+   * Repeated commit call after job commit.
+   */
+  @Test
+  public void testCommitterWithDuplicatedCommit() throws Exception {
+    describe("Call a task then job commit twice;" +
+        "expect the second task commit to fail.");
+    JobData jobData = startJob(true);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+
+    // do commit
+    describe("committing task");
+    committer.commitTask(tContext);
+
+    // repeated commit while TA dir exists fine/idempotent
+    committer.commitTask(tContext);
+
+    describe("committing job");
+    committer.commitJob(jContext);
+    describe("commit complete\n");
+
+    describe("cleanup");
+    committer.cleanupJob(jContext);
+    // validate output
+    validateContent(outputDir, shouldExpectSuccessMarker(),
+        committer.getJobUniqueId());
+
+    // the retried task commit is expected to fail as the task attempt dir no longer exists
+    describe("Attempting commit of the same task after job commit -expecting failure");
+    expectFNFEonTaskCommit(committer, tContext);
+  }
+
+  /**
+   * HADOOP-17258. If a second task attempt is committed, it
+   * must succeed, and the output of the first TA, even if already
+   * committed, MUST NOT be visible in the final output.
+   * <p></p>
+   * What's important is not just that only one TA must succeed,
+   * but it must be the last one executed.
+   */
+  @Test
+  public void testTwoTaskAttemptsCommit() throws Exception {
+    describe("Commit two task attempts;" +
+        " expect the second attempt to succeed.");
+    JobData jobData = startJob(false);
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+    // do commit
+    describe("\ncommitting task");
+    // write output for TA 1,
+    Path outputTA1 = writeTextOutput(tContext);
+
+    // speculatively execute committer 2.
+
+    // jobconf with a different base to its parts.
+    Configuration conf2 = jobData.conf;
+    conf2.set("mapreduce.output.basename", "attempt2");
+    String attempt2 = "attempt_" + jobId + "_m_000000_1";
+    TaskAttemptID ta2 = TaskAttemptID.forName(attempt2);
+    TaskAttemptContext tContext2 = new TaskAttemptContextImpl(
+        conf2, ta2);
+
+    ManifestCommitter committer2 = localCommitterFactory
+        .createCommitter(tContext2);
+    setupCommitter(committer2, tContext2);
+
+    // verify working dirs are different
+    Assertions.assertThat(committer.getWorkPath())
+        .describedAs("Working dir of %s", committer)
+        .isNotEqualTo(committer2.getWorkPath());
+
+    // write output for TA 2,
+    Path outputTA2 = writeTextOutput(tContext2);
+
+    // verify the names are different.
+    String name1 = outputTA1.getName();
+    String name2 = outputTA2.getName();
+    Assertions.assertThat(name1)
+        .describedAs("name of task attempt output %s", outputTA1)
+        .isNotEqualTo(name2);
+
+    // commit task 1
+    committer.commitTask(tContext);
+
+    // then pretend that task1 didn't respond, so
+    // commit task 2
+    committer2.commitTask(tContext2);
+
+    // and the job
+    committer2.commitJob(tContext);
+
+    // validate output
+    FileSystem fs = getFileSystem();
+    ManifestSuccessData successData = validateSuccessFile(fs, outputDir,
+        1,
+        "");
+    Assertions.assertThat(successData.getFilenames())
+        .describedAs("Files committed")
+        .hasSize(1);
+
+    assertPathExists("attempt2 output", new Path(outputDir, name2));
+    assertPathDoesNotExist("attempt1 output", new Path(outputDir, name1));
+
+  }
+
+  protected boolean shouldExpectSuccessMarker() {
+    return true;
+  }
+
+  /**
+   * Simulate a failure on the first job commit; expect the
+   * second to succeed.
+   */
+  /*@Test
+  public void testCommitterWithFailure() throws Exception {
+    describe("Fail the first job commit then retry");
+    JobData jobData = startJob(new FailingCommitterFactory(), true);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+
+    // do commit
+    committer.commitTask(tContext);
+
+    // now fail job
+    expectSimulatedFailureOnJobCommit(jContext, committer);
+
+    commitJob(committer, jContext);
+
+    // but the data got there, due to the order of operations.
+    validateContent(outDir, shouldExpectSuccessMarker(),
+        committer.getUUID());
+    expectJobCommitToFail(jContext, committer);
+  }
+*/
+
+  /**
+   * Override point: the failure expected on the attempt to commit a failed
+   * job.
+   * @param jContext job context
+   * @param committer committer
+   * @throws Exception any unexpected failure.
+   */
+  protected void expectJobCommitToFail(JobContext jContext,
+      ManifestCommitter committer) throws Exception {
+    // next attempt will fail as there is no longer a directory to commit
+    expectJobCommitFailure(jContext, committer,
+        FileNotFoundException.class);
+  }
+
+  /**
+   * Expect a job commit operation to fail with a specific exception.
+   * @param jContext job context
+   * @param committer committer
+   * @param clazz class of exception
+   * @return the caught exception
+   * @throws Exception any unexpected failure.
+   */
+  protected static <E extends IOException> E expectJobCommitFailure(
+      JobContext jContext,
+      ManifestCommitter committer,
+      Class<E> clazz)
+      throws Exception {
+
+    return intercept(clazz,
+        () -> {
+          committer.commitJob(jContext);
+          return committer.toString();
+        });
+  }
+
+  protected static void expectFNFEonTaskCommit(
+      ManifestCommitter committer,
+      TaskAttemptContext tContext) throws Exception {
+    intercept(FileNotFoundException.class,
+        () -> {
+          committer.commitTask(tContext);
+          return committer.toString();
+        });
+  }
+
+  /**
+   * Commit a task with no output.
+   * Dest dir should exist.
+   */
+  @Test
+  public void testCommitterWithNoOutputs() throws Exception {
+    describe("Have a task and job with no outputs: expect success");
+    JobData jobData = startJob(localCommitterFactory, false);
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+
+    // do commit
+    committer.commitTask(tContext);
+    Path attemptPath = committer.getTaskAttemptPath(tContext);
+    ContractTestUtils.assertPathExists(
+        attemptPath.getFileSystem(tContext.getConfiguration()),
+        "task attempt dir",
+        attemptPath);
+  }
+
+
+  @Test
+  public void testMapFileOutputCommitter() throws Exception {
+    describe("Test that the committer generates map output into a directory\n" +
+        "starting with the prefix part-");
+    JobData jobData = startJob(false);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+    Configuration conf = jobData.conf;
+
+    // write output
+    writeMapFileOutput(new MapFileOutputFormat()
+            .getRecordWriter(tContext), tContext);
+
+    // do commit
+    commitTaskAndJob(committer, jContext, tContext);
+    FileSystem fs = getFileSystem();
+
+    lsR(fs, outputDir, true);
+    String ls = ls(outputDir);
+    describe("\nvalidating");
+
+    // validate output
+    verifySuccessMarker(outputDir, committer.getJobUniqueId());
+
+    describe("validate output of %s", outputDir);
+    validateMapFileOutputContent(fs, outputDir);
+
+    // Ensure getReaders call works and also ignores
+    // hidden filenames (_ or . prefixes)
+    describe("listing");
+    FileStatus[] filtered = fs.listStatus(outputDir, HIDDEN_FILE_FILTER);
+    Assertions.assertThat(filtered)
+        .describedAs("listed children under %s", ls)
+        .hasSize(1);
+    FileStatus fileStatus = filtered[0];
+    Assertions.assertThat(fileStatus.getPath().getName())
+        .as("Not the part file: " + fileStatus)
+        .startsWith(PART_00000);
+
+    describe("getReaders()");
+    Assertions.assertThat(getReaders(fs, outputDir, conf))
+        .describedAs("getReaders() MapFile.Reader entries with shared FS %s %s", outputDir, ls)
+        .hasSize(1);
+
+    describe("getReaders(new FS)");
+    FileSystem fs2 = FileSystem.get(outputDir.toUri(), conf);
+    Assertions.assertThat(getReaders(fs2, outputDir, conf))
+        .describedAs("getReaders(new FS) %s %s", outputDir, ls)
+        .hasSize(1);
+
+    describe("MapFileOutputFormat.getReaders");
+    Assertions.assertThat(MapFileOutputFormat.getReaders(outputDir, conf))
+        .describedAs("MapFileOutputFormat.getReaders(%s) %s", outputDir, ls)
+        .hasSize(1);
+
+  }
+
+  /** Open the output generated by this format. */
+  @SuppressWarnings("IOResourceOpenedButNotSafelyClosed")
+  private static MapFile.Reader[] getReaders(FileSystem fs,
+      Path dir,
+      Configuration conf) throws IOException {
+    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, HIDDEN_FILE_FILTER));
+
+    // sort names, so that hash partitioning works
+    Arrays.sort(names);
+
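+    // open a reader per part directory; callers are expected to close them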
+    MapFile.Reader[] parts = new MapFile.Reader[names.length];
+    for (int i = 0; i < names.length; i++) {
+      parts[i] = new MapFile.Reader(names[i], conf);
+    }
+    return parts;
+  }
+
+  public static final PathFilter HIDDEN_FILE_FILTER = (path) ->
+      !path.getName().startsWith("_") && !path.getName().startsWith(".");
+
+  /**
+   * A functional interface which an action under test must implement.
+   */
+  @FunctionalInterface
+  public interface ActionToTest {
+
+    void exec(Job job, JobContext jContext, TaskAttemptContext tContext,
+        ManifestCommitter committer) throws Exception;
+  }
+
+  @Test
+  public void testAbortTaskNoWorkDone() throws Exception {
+    executeWork("abort task no work",
+        (job, jContext, tContext, committer) ->
+            committer.abortTask(tContext));
+  }
+
+  @Test
+  public void testAbortJobNoWorkDone() throws Exception {
+    executeWork("abort task no work",
+        (job, jContext, tContext, committer) ->
+            committer.abortJob(jContext, JobStatus.State.RUNNING));
+  }
+
+  @Test
+  public void testCommitJobButNotTask() throws Exception {
+    executeWork("commit a job while a task's work is pending, " +
+            "expect task writes to be cancelled.",
+        (job, jContext, tContext, committer) -> {
+          // step 1: write the text
+          writeTextOutput(tContext);
+          // step 2: commit the job
+          createCommitter(tContext).commitJob(tContext);
+          // verify that no output can be observed
+          assertPart0000DoesNotExist(outputDir);
+        }
+    );
+  }
+
+  @Test
+  public void testAbortTaskThenJob() throws Exception {
+    JobData jobData = startJob(true);
+    ManifestCommitter committer = jobData.committer;
+
+    // do abort
+    committer.abortTask(jobData.tContext);
+
+    intercept(FileNotFoundException.class, "",
+        () -> getPart0000(committer.getWorkPath()));
+
+    committer.abortJob(jobData.jContext, JobStatus.State.FAILED);
+    assertJobAbortCleanedUp(jobData);
+
+  }
+
+  /**
+   * Extension point: assert that the job was all cleaned up after an abort.
+   * Base assertions
+   * <ul>
+   *   <li>Output dir is absent or, if present, empty</li>
+   * </ul>
+   * @param jobData job data
+   * @throws Exception failure
+   */
+  public void assertJobAbortCleanedUp(JobData jobData) throws Exception {
+    FileSystem fs = getFileSystem();
+    try {
+      FileStatus[] children = listChildren(fs, outputDir);
+      if (children.length != 0) {
+        lsR(fs, outputDir, true);
+      }
+      Assertions.assertThat(children)
+          .as("Output directory not empty " + ls(outputDir))
+          .containsExactly(new FileStatus[0]);
+    } catch (FileNotFoundException e) {
+      // this is a valid state; it means the dest dir doesn't exist yet.
+    }
+
+  }
+
+  @Test
+  public void testFailAbort() throws Exception {
+    describe("Abort the task, then job (failed), abort the job again");
+    JobData jobData = startJob(true);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+    ManifestCommitter committer = jobData.committer;
+
+    // do abort
+    committer.abortTask(tContext);
+
+    committer.getJobAttemptPath(jContext);
+    committer.getTaskAttemptPath(tContext);
+    assertPart0000DoesNotExist(outputDir);
+    assertSuccessMarkerDoesNotExist(outputDir);
+    describe("Aborting job into %s", outputDir);
+
+    committer.abortJob(jContext, JobStatus.State.FAILED);
+
+    assertTaskAttemptPathDoesNotExist(committer, tContext);
+    assertJobAttemptPathDoesNotExist(committer, jContext);
+
+    // verify a failure report
+    ManifestSuccessData report = loadReport(jobData.jobId(), false);
+    Map<String, String> diag = report.getDiagnostics();
+    Assertions.assertThat(diag)
+        .describedAs("Stage entry in report")
+        .containsEntry(STAGE, OP_STAGE_JOB_ABORT);
+    IOStatisticsSnapshot reportStats = report.getIOStatistics();
+    verifyStatisticCounterValue(reportStats,
+        OP_STAGE_JOB_ABORT, 1);
+
+    // try again; expect abort to be idempotent.
+    committer.abortJob(jContext, JobStatus.State.FAILED);
+
+  }
+
+  /**
+   * Assert that the given dir does not have the {@code _SUCCESS} marker.
+   * @param dir dir to scan
+   * @throws IOException IO Failure
+   */
+  protected void assertSuccessMarkerDoesNotExist(Path dir) throws IOException {
+    assertPathDoesNotExist("Success marker",
+        new Path(dir, SUCCESS_MARKER));
+  }
+
+  public void assertPart0000DoesNotExist(Path dir) throws Exception {
+    intercept(FileNotFoundException.class,
+        () -> getPart0000(dir));
+    assertPathDoesNotExist("expected output file", new Path(dir, PART_00000));
+  }
+
+  @Test
+  public void testAbortJobNotTask() throws Exception {
+    executeWork("abort task no work",
+        (job, jContext, tContext, committer) -> {
+          // write output
+          writeTextOutput(tContext);
+          committer.abortJob(jContext, JobStatus.State.RUNNING);
+          assertTaskAttemptPathDoesNotExist(
+              committer, tContext);
+          assertJobAttemptPathDoesNotExist(
+              committer, jContext);
+        });
+  }
+
+  /**
+   * This looks at what happens with concurrent commits.
+   * However, the failure condition it looks for (subdir under subdir)
+   * is the kind of failure you see on a rename-based commit.
+   *
+   * What it will not detect is the fact that both tasks will each commit
+   * to the destination directory. That is: whichever commits last wins.
+   *
+   * There's no way to stop this. Instead it is a requirement that the task
+   * commit operation is only executed for those task attempts which are
+   * known to have succeeded; all others must be aborted.
+   * @throws Exception failure
+   */
+  @Test
+  public void testConcurrentCommitTaskWithSubDir() throws Exception {
+    Job job = newJob();
+    FileOutputFormat.setOutputPath(job, outputDir);
+    final Configuration conf = job.getConfiguration();
+
+    final JobContext jContext =
+        new JobContextImpl(conf, taskAttempt0.getJobID());
+    ManifestCommitter amCommitter = createCommitter(
+        new TaskAttemptContextImpl(conf, taskAttempt0));
+    amCommitter.setupJob(jContext);
+
+    final TaskAttemptContext[] taCtx = new TaskAttemptContextImpl[2];
+    taCtx[0] = new TaskAttemptContextImpl(conf, taskAttempt0);
+    taCtx[1] = new TaskAttemptContextImpl(conf, taskAttempt1);
+
+    // IDE/checkstyle complain here about type casting but they
+    // are confused.
+    final TextOutputFormat<Writable, Object>[] tof =
+        new TextOutputForTests[2];
+
+    for (int i = 0; i < tof.length; i++) {
+      tof[i] = new TextOutputForTests<Writable, Object>() {
+        @Override
+        public Path getDefaultWorkFile(
+            TaskAttemptContext context,
+            String extension) throws IOException {
+          final ManifestCommitter foc = (ManifestCommitter)
+              getOutputCommitter(context);
+          return new Path(new Path(foc.getWorkPath(), SUB_DIR),
+              getUniqueFile(context, getOutputName(context), extension));
+        }
+      };
+    }
+
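+    // run both task attempts in parallel, each writing under SUB_DIR
+    // and committing its own task attempt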
+    final ExecutorService executor = HadoopExecutors.newFixedThreadPool(2);
+    try {
+      for (int i = 0; i < taCtx.length; i++) {
+        final int taskIdx = i;
+        executor.submit(() -> {
+          final OutputCommitter outputCommitter =
+              tof[taskIdx].getOutputCommitter(taCtx[taskIdx]);
+          outputCommitter.setupTask(taCtx[taskIdx]);
+          writeOutput(tof[taskIdx].getRecordWriter(taCtx[taskIdx]), taCtx[taskIdx]);
+          describe("Committing Task %d", taskIdx);
+          outputCommitter.commitTask(taCtx[taskIdx]);
+          return null;
+        });
+      }
+    } finally {
+      executor.shutdown();
+      while (!executor.awaitTermination(1, TimeUnit.SECONDS)) {
+        log().info("Awaiting thread termination!");
+      }
+    }
+
+    // if we commit here then all tasks will be committed, so there will
+    // be contention for that final directory: both parts will go in.
+
+    describe("\nCommitting Job");
+    amCommitter.commitJob(jContext);
+    assertPathExists("base output directory", outputDir);
+    assertPart0000DoesNotExist(outputDir);
+    Path outSubDir = new Path(outputDir, SUB_DIR);
+    assertPathDoesNotExist("Must not end up with sub_dir/sub_dir",
+        new Path(outSubDir, SUB_DIR));
+
+    // validate output
+    // There's no success marker in the subdirectory
+    validateContent(outSubDir, false, "");
+  }
+
+  @Test
+  public void testUnsupportedSchema() throws Throwable {
+    intercept(PathIOException.class, () ->
+        new ManifestCommitterFactory()
+            .createOutputCommitter(new Path("s3a://unsupported/"), null));
+  }
+
+  /**
+   * Factory for failing committers.
+   */
+
+
+/*
+  protected ManifestCommitter createFailingCommitter(
+  final TaskAttemptContext tContext)
+      throws IOException {
+    //     TODO
+    return null;
+  }
+
+  public class FailingCommitterFactory implements CommitterFactory {
+
+    @Override
+    public ManifestCommitter createCommitter(TaskAttemptContext context)
+        throws IOException {
+      return createFailingCommitter(context);
+    }
+  }*/
+  @Test
+  public void testOutputFormatIntegration() throws Throwable {
+    Configuration conf = getConfiguration();
+    Job job = newJob();
+    assertCommitterFactoryIsManifestCommitter(job, outputDir);
+    job.setOutputFormatClass(TextOutputForTests.class);
+    conf = job.getConfiguration();
+    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt0);
+    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 1);
+    JobContext jContext = new JobContextImpl(conf, taskAttempt0.getJobID());
+    TaskAttemptContext tContext = new TaskAttemptContextImpl(conf,
+        taskAttempt0);
+    TextOutputForTests<IntWritable, IntWritable> outputFormat =
+        (TextOutputForTests<IntWritable, IntWritable>)
+            ReflectionUtils.newInstance(tContext.getOutputFormatClass(), conf);
+    ManifestCommitter committer = (ManifestCommitter)
+        outputFormat.getOutputCommitter(tContext);
+
+    // setup
+    JobData jobData = new JobData(job, jContext, tContext, committer);
+    setupJob(jobData);
+    abortInTeardown(jobData);
+    TextOutputForTests.LoggingLineRecordWriter<IntWritable, IntWritable> recordWriter
+        = outputFormat.getRecordWriter(tContext);
+    IntWritable iw = new IntWritable(1);
+    recordWriter.write(iw, iw);
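+    // a single record "1\t1\n" is four bytes of output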
+    long expectedLength = 4;
+    Path dest = recordWriter.getDest();
+    validateTaskAttemptPathDuringWrite(dest, expectedLength);
+    recordWriter.close(tContext);
+    // at this point
+    validateTaskAttemptPathAfterWrite(dest, expectedLength);
+    Assertions.assertThat(committer.needsTaskCommit(tContext))
+        .as("Committer does not have data to commit " + committer)
+        .isTrue();
+    commitTask(committer, tContext);
+    // at this point the committer tasks stats should be current.
+    IOStatisticsSnapshot snapshot = new IOStatisticsSnapshot(
+        committer.getIOStatistics());
+    String commitsCompleted = COMMITTER_TASKS_COMPLETED_COUNT;
+    LOG.info("after task commit {}", ioStatisticsToPrettyString(snapshot));
+    verifyStatisticCounterValue(snapshot,
+        commitsCompleted, 1);
+    final TaskManifest manifest = loadManifest(
+        committer.getTaskManifestPath(tContext));
+    LOG.info("Manifest {}", manifest.toJson());
+
+    commitJob(committer, jContext);
+    LOG.info("committer iostatistics {}",
+        ioStatisticsSourceToString(committer));
+
+    // validate output
+    ManifestSuccessData successData = verifySuccessMarker(outputDir,
+        committer.getJobUniqueId());
+
+    // the task commit count should get through the job commit
+    IOStatisticsSnapshot successStats = successData.getIOStatistics();
+    LOG.info("loaded statistics {}", successStats);
+    verifyStatisticCounterValue(successStats,
+        commitsCompleted, 1);
+  }
+
+  /**
+   * Create a committer through reflection then use it to abort
+   * a task. This mimics the action of an AM when a container fails and
+   * the AM wants to abort the task attempt.
+   */
+  @Test
+  public void testAMWorkflow() throws Throwable {
+    describe("Create a committer with a null output path & use as an AM");
+    JobData jobData = startJob(true);
+    JobContext jContext = jobData.jContext;
+    TaskAttemptContext tContext = jobData.tContext;
+
+    TaskAttemptContext newAttempt = new TaskAttemptContextImpl(
+        jContext.getConfiguration(),
+        taskAttempt0);
+    Configuration conf = jContext.getConfiguration();
+
+    // bind
+    TextOutputForTests.bind(conf);
+
+    OutputFormat<?, ?> outputFormat
+        = ReflectionUtils.newInstance(newAttempt.getOutputFormatClass(), conf);
+    Path outputPath = FileOutputFormat.getOutputPath(newAttempt);
+    Assertions.assertThat(outputPath)
+        .as("null output path in new task attempt")
+        .isNotNull();
+
+    ManifestCommitter committer2 = (ManifestCommitter)
+        outputFormat.getOutputCommitter(newAttempt);
+    committer2.abortTask(tContext);
+
+  }
+
+  /**
+   * Make sure that two jobs in parallel directory trees coexist.
+   * Note: the two jobs are not trying to write to the same
+   * output directory.
+   * That should be possible, but cleanup must be disabled.
+   */
+  @Test
+  public void testParallelJobsToAdjacentPaths() throws Throwable {
+
+    describe("Run two jobs in parallel, assert they both complete");
+    JobData jobData = startJob(true);
+    Job job1 = jobData.job;
+    ManifestCommitter committer1 = jobData.committer;
+    JobContext jContext1 = jobData.jContext;
+    TaskAttemptContext tContext1 = jobData.tContext;
+
+    // now build up a second job
+    String jobId2 = randomJobId();
+    String attempt20 = "attempt_" + jobId2 + "_m_000000_0";
+    TaskAttemptID taskAttempt20 = TaskAttemptID.forName(attempt20);
+    String attempt21 = "attempt_" + jobId2 + "_m_000001_0";
+    TaskAttemptID taskAttempt21 = TaskAttemptID.forName(attempt21);
+
+    Path job1Dest = outputDir;
+    Path job2Dest = new Path(getOutputDir().getParent(),
+        getMethodName() + "job2Dest");
+    // little safety check
+    Assertions.assertThat(job2Dest)
+        .describedAs("Job destinations")
+        .isNotEqualTo(job1Dest);
+
+    // create the second job
+    Job job2 = newJob(job2Dest,
+        unsetUUIDOptions(new JobConf(getConfiguration())),
+        attempt20);
+    Configuration conf2 = job2.getConfiguration();
+    conf2.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 1);
+    ManifestCommitter committer2 = null;
+    try {
+      JobContext jContext2 = new JobContextImpl(conf2,
+          taskAttempt20.getJobID());
+      TaskAttemptContext tContext2 =
+          new TaskAttemptContextImpl(conf2, taskAttempt20);
+      committer2 = createCommitter(job2Dest, tContext2);
+      JobData jobData2 = new JobData(job2, jContext2, tContext2, committer2);
+      setupJob(jobData2);
+      abortInTeardown(jobData2);
+      // make sure the directories are different
+      Assertions.assertThat(committer1.getOutputPath())
+          .describedAs("Committer output path of %s and %s", committer1, committer2)
+          .isNotEqualTo(committer2.getOutputPath());
+      // and job IDs
+      Assertions.assertThat(committer1.getJobUniqueId())
+          .describedAs("JobUnique IDs of %s and %s", committer1, committer2)
+          .isNotEqualTo(committer2.getJobUniqueId());
+
+      // job2 setup, write some data there
+      writeTextOutput(tContext2);
+
+      // at this point, job1 and job2 both have uncommitted tasks
+
+      // commit tasks in order task 2, task 1.
+      commitTask(committer2, tContext2);
+      commitTask(committer1, tContext1);
+
+      // commit jobs in order job 1, job 2
+      commitJob(committer1, jContext1);
+
+      getPart0000(job1Dest);
+
+      commitJob(committer2, jContext2);
+      getPart0000(job2Dest);
+
+    } finally {
+      // clean things up in test failures.
+      FileSystem fs = getFileSystem();
+      if (committer1 != null) {
+        fs.delete(committer1.getOutputPath(), true);
+      }
+      if (committer2 != null) {
+        fs.delete(committer2.getOutputPath(), true);
+      }
+    }
+
+  }
+
+  /**
+   * Strip staging/spark UUID options.
+   * @param conf config
+   * @return the patched config
+   */
+  protected Configuration unsetUUIDOptions(final Configuration conf) {
+    conf.unset(SPARK_WRITE_UUID);
+    return conf;
+  }
+
+  /**
+   * Assert that a committer's job attempt path exists.
+   * For the staging committers, this is in the cluster FS.
+   * @param committer committer
+   * @param jobContext job context
+   * @throws IOException failure
+   */
+  protected void assertJobAttemptPathExists(
+      final ManifestCommitter committer,
+      final JobContext jobContext) throws IOException {
+    Path attemptPath = committer.getJobAttemptPath(jobContext);
+    ContractTestUtils.assertIsDirectory(
+        attemptPath.getFileSystem(committer.getConf()),
+        attemptPath);
+  }
+
+  /**
+   * Validate the path of a file being written to during the write
+   * itself.
+   * @param p path
+   * @param expectedLength expected length of the data written
+   * @throws IOException IO failure
+   */
+  protected void validateTaskAttemptPathDuringWrite(Path p,
+      final long expectedLength) throws IOException {
+
+  }
+
+  /**
+   * Validate the path of a file being written to after the write
+   * operation has completed.
+   * @param p path
+   * @param expectedLength expected length of the data written
+   * @throws IOException IO failure
+   */
+  protected void validateTaskAttemptPathAfterWrite(Path p,
+      final long expectedLength) throws IOException {
+
+  }
+
+  /**
+   * Perform any actions needed to validate the working directory of
+   * a committer.
+   * For example: filesystem, path attributes
+   * @param committer committer instance
+   * @param context task attempt context
+   * @throws IOException IO failure
+   */
+  protected void validateTaskAttemptWorkingDirectory(
+      ManifestCommitter committer,
+      TaskAttemptContext context) throws IOException {
+  }
+
+  /**
+   * Commit a task then validate the state of the committer afterwards.
+   * @param committer committer
+   * @param tContext task context
+   * @throws IOException IO failure
+   */
+  protected void commitTask(final ManifestCommitter committer,
+      final TaskAttemptContext tContext) throws IOException {
+    committer.commitTask(tContext);
+  }
+
+  /**
+   * Commit a job then validate the state of the committer afterwards.
+   * @param committer committer
+   * @param jContext job context
+   * @throws IOException IO failure
+   */
+  protected void commitJob(final ManifestCommitter committer,
+      final JobContext jContext) throws IOException {
+    committer.commitJob(jContext);
+
+  }
+
+}

+ 379 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestRenameStageFailure.java

@@ -0,0 +1,379 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Assume;
+import org.junit.Test;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.fs.CommonPathCapabilities;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.contract.ContractTestUtils;
+import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.UnreliableManifestStoreOperations;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.RenameFilesStage;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.fs.contract.ContractTestUtils.verifyFileContents;
+import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticCounter;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_COMMIT_FILE_RENAME;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.getEtag;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.UnreliableManifestStoreOperations.SIMULATED_FAILURE;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.AbstractJobOrTaskStage.FAILED_TO_RENAME_PREFIX;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * Test renaming files with fault injection.
+ * This explores etag support and overwrite-on-rename semantics
+ * of the target FS, so some of the tests behave differently
+ * on different stores.
+ */
+public class TestRenameStageFailure extends AbstractManifestCommitterTest {
+
+  /**
+   * Statistic to look for.
+   */
+  public static final String RENAME_FAILURES = OP_COMMIT_FILE_RENAME + ".failures";
+  private static final int FAILING_FILE_INDEX = 5;
+
+  /**
+   * Fault Injection.
+   */
+  private UnreliableManifestStoreOperations failures;
+
+  /** etags returned in listing/file status operations? */
+  private boolean etagsSupported;
+
+  /** etags preserved through rename? */
+  private boolean etagsPreserved;
+
+  /** resilient commit expected? */
+  private boolean resilientCommit;
+
+  protected boolean isResilientCommit() {
+    return resilientCommit;
+  }
+
+  protected boolean isEtagsPreserved() {
+    return etagsPreserved;
+  }
+
+  protected boolean isEtagsSupported() {
+    return etagsSupported;
+  }
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    final FileSystem fs = getFileSystem();
+    final Path methodPath = methodPath();
+    etagsSupported = fs.hasPathCapability(methodPath,
+        CommonPathCapabilities.ETAGS_AVAILABLE);
+    etagsPreserved = fs.hasPathCapability(methodPath,
+        CommonPathCapabilities.ETAGS_PRESERVED_IN_RENAME);
+
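+    // wrap the store operations with fault injection; resilient-commit
+    // support is probed on the underlying (real) store operations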
+    final ManifestStoreOperations wrappedOperations = getStoreOperations();
+    failures
+        = new UnreliableManifestStoreOperations(wrappedOperations);
+    setStoreOperations(failures);
+    resilientCommit = wrappedOperations.storeSupportsResilientCommit();
+  }
+
+  /**
+   * Does this test suite require rename resilience in the store/FS?
+   * @return true if the store operations are resilient.
+   */
+  protected boolean requireRenameResilience() {
+    return false;
+  }
+
+  @Test
+  public void testResilienceAsExpected() throws Throwable {
+    Assertions.assertThat(isResilientCommit())
+        .describedAs("resilient commit support")
+        .isEqualTo(requireRenameResilience());
+  }
+
+  @Test
+  public void testRenameSourceException() throws Throwable {
+    describe("rename fails raising an IOE -expect stage to fail" +
+        " and exception message preserved");
+
+    // destination directory.
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir);
+    Path jobAttemptTaskSubDir = stageConfig.getJobAttemptTaskSubDir();
+
+    // create a manifest with a lot of files, one of whose
+    // renames will fail
+    TaskManifest manifest = new TaskManifest();
+    createFileset(destDir, jobAttemptTaskSubDir, manifest, filesToCreate());
+    final List<FileEntry> filesToCommit = manifest.getFilesToCommit();
+    final FileEntry entry = filesToCommit.get(FAILING_FILE_INDEX);
+    failures.addRenameSourceFilesToFail(entry.getSourcePath());
+
+    // rename MUST fail
+    expectRenameFailure(
+        new RenameFilesStage(stageConfig),
+        manifest,
+        filesToCommit.size(),
+        SIMULATED_FAILURE,
+        PathIOException.class);
+  }
+
+  /**
+   * Number of files to create; must be more than
+   * {@link #FAILING_FILE_INDEX}.
+   */
+  protected int filesToCreate() {
+    return 100;
+  }
+
+  @Test
+  public void testCommitMissingFile() throws Throwable {
+    describe("commit a file which doesn't exist. Expect FNFE always");
+    // destination directory.
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir);
+    Path jobAttemptTaskSubDir = stageConfig.getJobAttemptTaskSubDir();
+    TaskManifest manifest = new TaskManifest();
+    final List<FileEntry> filesToCommit = manifest.getFilesToCommit();
+
+    Path source = new Path(jobAttemptTaskSubDir, "source.parquet");
+    Path dest = new Path(destDir, "destdir.parquet");
+    filesToCommit.add(new FileEntry(source, dest, 0, null));
+    final FileNotFoundException ex = expectRenameFailure(
+        new RenameFilesStage(stageConfig),
+        manifest,
+        0,
+        "",
+        FileNotFoundException.class);
+    LOG.info("Exception raised: {}", ex.toString());
+  }
+
+  /**
+   * Verify that when a job is configured to delete target paths,
+   * renaming will overwrite them.
+   * This test has to use FileSystem contract settings to determine
+   * whether or not the FS will actually permit file-over-file rename.
+   * The local filesystem follows POSIX semantics, so the test will not
+   * fail there when the destination exists.
+   * ABFS and GCS reject file-over-file rename, so they are required to
+   * fail the first rename sequence, but to succeed once delete.target.paths
+   * is true.
+   */
+  @Test
+  public void testDeleteTargetPaths() throws Throwable {
+    describe("Verify that target path deletion works");
+    // destination directory.
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir)
+        .withDeleteTargetPaths(true);
+    Path jobAttemptTaskSubDir = stageConfig.getJobAttemptTaskSubDir();
+    final Path source = new Path(jobAttemptTaskSubDir, "source.txt");
+    final Path dest = new Path(destDir, "source.txt");
+    final byte[] sourceData = "data".getBytes(StandardCharsets.UTF_8);
+    final FileSystem fs = getFileSystem();
+    ContractTestUtils.createFile(fs, source, false, sourceData);
+    touch(fs, dest);
+    TaskManifest manifest = new TaskManifest();
+    final FileEntry entry = createEntryWithEtag(source, dest);
+    manifest.addFileToCommit(entry);
+
+    List<TaskManifest> manifests = new ArrayList<>();
+    manifests.add(manifest);
+
+    // local POSIX filesystems allow rename of file onto file, so
+    // don't fail on the rename.
+    boolean renameOverwritesDest = isSupported(RENAME_OVERWRITES_DEST);
+
+    if (!renameOverwritesDest) {
+      // HDFS, ABFS and GCS do all reject rename of file onto file.
+      // ABFS will use its rename operation so will even raise a
+      // meaningful exception here.
+      final IOException ex = expectRenameFailure(
+          new RenameFilesStage(stageConfig.withDeleteTargetPaths(false)),
+          manifest,
+          0,
+          "",
+          IOException.class);
+      LOG.info("Exception raised: {}", ex.toString());
+    }
+
+    // delete target paths and it works
+    new RenameFilesStage(stageConfig.withDeleteTargetPaths(true))
+        .apply(Pair.of(manifests, Collections.emptySet()));
+
+    // and the new data made it over
+    verifyFileContents(fs, dest, sourceData);
+
+    // let's check the etag too, for completeness
+    if (isEtagsPreserved()) {
+      Assertions.assertThat(getEtag(fs.getFileStatus(dest)))
+          .describedAs("Etag of destination file %s", dest)
+          .isEqualTo(entry.getEtag());
+    }
+
+  }
+
+  @Test
+  public void testRenameReturnsFalse() throws Throwable {
+    describe("commit where rename() returns false for one file." +
+        " Expect failure to be escalated to an IOE");
+
+    Assume.assumeTrue("not used when resilient commits are available",
+        !resilientCommit);
+    // destination directory.
+    Path destDir = methodPath();
+    StageConfig stageConfig = createStageConfigForJob(JOB1, destDir);
+    Path jobAttemptTaskSubDir = stageConfig.getJobAttemptTaskSubDir();
+
+    // create a manifest with a lot of files, one of whose
+    // renames will fail
+    TaskManifest manifest = new TaskManifest();
+    createFileset(destDir, jobAttemptTaskSubDir, manifest, filesToCreate());
+
+    final List<FileEntry> filesToCommit = manifest.getFilesToCommit();
+    final FileEntry entry = filesToCommit.get(FAILING_FILE_INDEX);
+    failures.addRenameSourceFilesToFail(entry.getSourcePath());
+
+    // switch to rename returning false; again, this must
+    // be escalated to a failure.
+    failures.setRenameToFailWithException(false);
+    expectRenameFailure(
+        new RenameFilesStage(stageConfig),
+        manifest,
+        filesToCommit.size(),
+        FAILED_TO_RENAME_PREFIX,
+        PathIOException.class);
+  }
+
+  /**
+   * Create the source files for a task.
+   * @param destDir destination directory
+   * @param taskAttemptDir directory of the task attempt
+   * @param manifest manifest to update.
+   * @param fileCount how many files.
+   */
+  private void createFileset(
+      final Path destDir,
+      final Path taskAttemptDir,
+      final TaskManifest manifest,
+      final int fileCount) throws IOException {
+    final FileSystem fs = getFileSystem();
+    for (int i = 0; i < fileCount; i++) {
+      String name = String.format("file%04d", i);
+      Path src = new Path(taskAttemptDir, name);
+      Path dest = new Path(destDir, name);
+      touch(fs, src);
+
+      final FileEntry entry = createEntryWithEtag(src, dest);
+      manifest.addFileToCommit(entry);
+    }
+  }
+
+  /**
+   * Create a manifest entry, including size.
+   * If the FS supports etags, one is retrieved.
+   * @param source source
+   * @param dest dest
+   * @return entry
+   * @throws IOException if getFileStatus failed.
+   */
+  private FileEntry createEntryWithEtag(final Path source,
+      final Path dest)
+      throws IOException {
+    final FileStatus st = getFileSystem().getFileStatus(source);
+    final String etag = isEtagsSupported()
+        ? getEtag(st)
+        : null;
+
+    return new FileEntry(source, dest, st.getLen(), etag);
+  }
+
+  /**
+   * Execute rename, expecting a failure.
+   * The number of files renamed MUST be less than the value of {@code files}.
+   * @param stage stage
+   * @param manifest task manifests
+   * @param files number of files being renamed.
+   * @param errorText text which must be in the exception string
+   * @param exceptionClass class of the exception
+   * @return the caught exception
+   * @throws Exception if anything else went wrong, or no exception was raised.
+   */
+  private <E extends Throwable> E expectRenameFailure(
+      RenameFilesStage stage,
+      TaskManifest manifest,
+      int files,
+      String errorText,
+      Class<E> exceptionClass) throws Exception {
+
+    List<TaskManifest> manifests = new ArrayList<>();
+    manifests.add(manifest);
+    ProgressCounter progressCounter = getProgressCounter();
+    progressCounter.reset();
+    IOStatisticsStore iostatistics = stage.getIOStatistics();
+    long failures0 = iostatistics.counters().get(RENAME_FAILURES);
+
+    // rename MUST raise an exception.
+    E ex = intercept(exceptionClass, errorText, () ->
+        stage.apply(Pair.of(manifests, Collections.emptySet())));
+
+    LOG.info("Statistics {}", ioStatisticsToPrettyString(iostatistics));
+    // the IOStatistics record the rename as a failure.
+    assertThatStatisticCounter(iostatistics, RENAME_FAILURES)
+        .isEqualTo(failures0 + 1);
+
+    // count of files committed MUST be less than expected.
+    if (files > 0) {
+
+      Assertions.assertThat(stage.getFilesCommitted())
+          .describedAs("Files Committed by stage")
+          .isNotEmpty()
+          .hasSizeLessThan(files);
+
+    }
+
+    // the progress counter will show that the rename stage invoked it;
+    // there's no assertion on the actual value as it depends on
+    // the execution timing of the threads.
+
+    Assertions.assertThat(progressCounter.value())
+        .describedAs("Progress counter %s", progressCounter)
+        .isGreaterThan(0);
+    return ex;
+  }
+}
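The expectRenameFailure() helper above is built on LambdaTestUtils.intercept(). A minimal, hedged sketch of that pattern follows; "someStage" and "manifests" are placeholders for the fixtures constructed earlier in the test, not names from this patch.

    // intercept() runs the closure, fails the test if nothing is thrown, and
    // returns the exception once its class and message text match, so further
    // assertions can inspect it.
    PathIOException ex = intercept(PathIOException.class, FAILED_TO_RENAME_PREFIX,
        () -> someStage.apply(Pair.of(manifests, Collections.emptySet())));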

+ 185 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestTaskManifestFileIO.java

@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.IOException;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.DirEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.JsonSerialization;
+
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.assertDirEntryMatch;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.assertFileEntryMatch;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.marshallPath;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData.unmarshallPath;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+
+/**
+ * Test reading/writing of manifest files.
+ */
+public class TestTaskManifestFileIO extends AbstractManifestCommitterTest {
+
+  private TaskManifest source;
+
+  private ManifestCommitterTestSupport.JobAndTaskIDsForTests taskIDs;
+
+  private String taskAttempt00;
+
+  private Path testPath;
+
+  private Path taPath;
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    taskIDs = new ManifestCommitterTestSupport.JobAndTaskIDsForTests(2, 2);
+    source = new TaskManifest();
+    taskAttempt00 = taskIDs.getTaskAttempt(0, 0);
+    source.setTaskAttemptID(taskAttempt00);
+    testPath = methodPath();
+    taPath = new Path(testPath, "  " + taskAttempt00);
+    source.setTaskAttemptDir(marshallPath(taPath));
+  }
+
+  /**
+   * Test marshalling, paying attention to paths with spaces in them
+   * as they've been a source of trouble in the S3A committers.
+   */
+  @Test
+  public void testJsonRoundTrip() throws Throwable {
+    describe("Save manifest file to string and back");
+    Path subdirS = new Path(taPath, "subdir");
+    Path subdirD = new Path(testPath, "subdir");
+    source.addDirectory(DirEntry.dirEntry(subdirD, 0, 0));
+
+    // a file
+    Path subfileS = new Path(subdirS, "file");
+    Path subfileD = new Path(subdirD, "file");
+    long len = 256L;
+    FileEntry subFileEntry = new FileEntry(subfileS,
+        subfileD, len, "etag");
+    source.addFileToCommit(subFileEntry);
+
+
+    JsonSerialization<TaskManifest> serializer
+        = TaskManifest.serializer();
+
+    String json = serializer.toJson(source);
+    LOG.info("serialized form\n{}", json);
+    TaskManifest deser = serializer.fromJson(json);
+    deser.validate();
+
+    Assertions.assertThat(deser.getTaskAttemptID())
+        .describedAs("Task attempt ID")
+        .isEqualTo(taskAttempt00);
+
+    Assertions.assertThat(unmarshallPath(deser.getTaskAttemptDir()))
+        .describedAs("Task attempt Dir %s",
+            deser.getTaskAttemptDir())
+        .isEqualTo(taPath);
+
+    Assertions.assertThat(deser.getDestDirectories())
+        .hasSize(1)
+        .allSatisfy(d -> assertDirEntryMatch(d, subdirD, 0));
+    Assertions.assertThat(deser.getFilesToCommit())
+        .hasSize(1)
+        .allSatisfy(d -> assertFileEntryMatch(d, subfileS, subfileD, len));
+    final FileEntry entry = deser.getFilesToCommit().get(0);
+    assertFileEntryMatch(entry, subfileS, subfileD, len);
+    Assertions.assertThat(entry.getEtag())
+        .describedAs("etag of %s", entry)
+        .isEqualTo("etag");
+
+  }
+
+  /**
+   * The manifest validation logic has a safety check that only one
+   * file can rename to the same destination, and that the entries
+   * are valid.
+   */
+  @Test
+  public void testValidateRejectsTwoCommitsToSameDest() throws Throwable {
+
+    Path subdirS = new Path(taPath, "subdir");
+    Path subdirD = new Path(testPath, "subdir");
+    source.addDirectory(DirEntry.dirEntry(subdirD, 0, 0));
+
+    // a file
+    Path subfileS = new Path(subdirS, "file");
+    Path subfileS2 = new Path(subdirS, "file2");
+    Path subfileD = new Path(subdirD, "file");
+    long len = 256L;
+    source.addFileToCommit(
+        new FileEntry(subfileS, subfileD, len, "tag1"));
+    source.addFileToCommit(
+        new FileEntry(subfileS2, subfileD, len, "tag2"));
+    assertValidationFailureOnRoundTrip(source);
+  }
+
+  /**
+   * The manifest validation logic has a safety check that only one
+   * file can rename to the same destination, and that the entries
+   * are valid.
+   */
+  @Test
+  public void testValidateRejectsIncompleteFileEntry() throws Throwable {
+    source.addFileToCommit(
+        new FileEntry(taPath, null, 0, null));
+    assertValidationFailureOnRoundTrip(source);
+  }
+
+  /**
+   * negative lengths are not allowed.
+   */
+  @Test
+  public void testValidateRejectsInvalidFileLength() throws Throwable {
+    source.addFileToCommit(
+        new FileEntry(taPath, testPath, -1, null));
+    assertValidationFailureOnRoundTrip(source);
+  }
+
+  @Test
+  public void testRejectIncompatibleVersion() throws Throwable {
+    source.setVersion(5);
+    assertValidationFailureOnRoundTrip(source);
+  }
+
+  @Test
+  public void testRejectIncompatibleType() throws Throwable {
+    source.setType("Incompatible type");
+    assertValidationFailureOnRoundTrip(source);
+  }
+
+  private void assertValidationFailureOnRoundTrip(
+      final TaskManifest manifest) throws Exception {
+    JsonSerialization<TaskManifest> serializer
+        = TaskManifest.serializer();
+    String json = serializer.toJson(manifest);
+    LOG.info("serialized form\n{}", json);
+    TaskManifest deser = serializer.fromJson(json);
+    intercept(IOException.class, deser::validate);
+  }
+
+
+}
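A hedged sketch of the path round-trip these tests exercise: marshallPath()/unmarshallPath(), imported at the top of the suite, convert a Path to and from its serialized string form, and a name with leading spaces (as set up in setup()) must survive the trip. The literal paths below are illustrative only.

    Path parent = new Path("file:/tmp/job_0001");
    Path withSpaces = new Path(parent, "  attempt_00_00");   // leading spaces, as in setup()
    String wire = marshallPath(withSpaces);     // string form stored in the manifest
    Path restored = unmarshallPath(wire);       // parsed back when the manifest is loaded
    Assertions.assertThat(restored).isEqualTo(withSpaces);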

+ 136 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TextOutputForTests.java

@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ReflectionUtils;
+
+/**
+ * A subclass of {@link TextOutputFormat} which logs what is happening, and
+ * returns a {@link LoggingLineRecordWriter} which allows the caller
+ * to get the destination path.
+ * @param <K> key
+ * @param <V> value
+ */
+public class TextOutputForTests<K, V> extends TextOutputFormat<K, V> {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(TextOutputForTests.class);
+
+  public static final String NAME
+      = "org.apache.hadoop.fs.s3a.commit.LoggingTextOutputFormat";
+
+  @Override
+  public LoggingLineRecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
+      throws IOException, InterruptedException {
+    Configuration conf = job.getConfiguration();
+    boolean isCompressed = getCompressOutput(job);
+    String keyValueSeparator = conf.get(SEPARATOR, "\t");
+    CompressionCodec codec = null;
+    String extension = "";
+    if (isCompressed) {
+      Class<? extends CompressionCodec> codecClass =
+          getOutputCompressorClass(job, GzipCodec.class);
+      codec = ReflectionUtils.newInstance(codecClass, conf);
+      extension = codec.getDefaultExtension();
+    }
+    Path file = getDefaultWorkFile(job, extension);
+    FileSystem fs = file.getFileSystem(conf);
+    FSDataOutputStream fileOut = fs.create(file, true);
+    LOG.debug("Creating LineRecordWriter with destination {}", file);
+    if (isCompressed) {
+      return new LoggingLineRecordWriter<>(
+          file, new DataOutputStream(codec.createOutputStream(fileOut)),
+          keyValueSeparator);
+    } else {
+      return new LoggingLineRecordWriter<>(file, fileOut, keyValueSeparator);
+    }
+  }
+
+  /**
+   * Writes a line; counts the number of lines written and logs the total
+   * at debug in the {@code close()} call.
+   * @param <K> key
+   * @param <V> value
+   */
+  public static class LoggingLineRecordWriter<K, V>
+      extends LineRecordWriter<K, V> {
+
+    private final Path dest;
+
+    private long lines;
+
+    public LoggingLineRecordWriter(Path dest, DataOutputStream out,
+        String keyValueSeparator) {
+      super(out, keyValueSeparator);
+      this.dest = dest;
+    }
+
+    public LoggingLineRecordWriter(DataOutputStream out, Path dest) {
+      super(out);
+      this.dest = dest;
+    }
+
+    @Override
+    public synchronized void write(K key, V value) throws IOException {
+      super.write(key, value);
+      lines++;
+    }
+
+    public synchronized void close(TaskAttemptContext context)
+        throws IOException {
+      LOG.debug("Closing output file {} with {} lines :{}",
+          dest, lines, out);
+      out.close();
+    }
+
+    public Path getDest() {
+      return dest;
+    }
+
+    public long getLines() {
+      return lines;
+    }
+  }
+
+  /**
+   * Bind to a configuration for job submission.
+   * @param conf configuration
+   */
+  public static void bind(Configuration conf) {
+    conf.setClass(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR,
+        TextOutputForTests.class,
+        OutputFormat.class);
+  }
+}
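A hedged usage sketch for the class above: a test job switches its output format over with bind(), and task-side code can then recover the destination path from the writer. The taskAttemptContext, key and value variables are placeholders, not names from this patch.

    Configuration conf = new Configuration();
    TextOutputForTests.bind(conf);   // the job now uses TextOutputForTests
    // ... submit the job; inside a task attempt:
    TextOutputForTests<LongWritable, Text> format = new TextOutputForTests<>();
    TextOutputForTests.LoggingLineRecordWriter<LongWritable, Text> writer =
        format.getRecordWriter(taskAttemptContext);
    writer.write(key, value);
    Path whereItWent = writer.getDest();   // destination file for later assertions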

+ 83 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ThreadLeakTracker.java

@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+
+import org.assertj.core.api.Assertions;
+
+/**
+ * Thread Leak tracker.
+ * Initialized with the current set of threads and some known-long-life
+ * threads whose presence must not be considered a failure.
+ */
+public class ThreadLeakTracker {
+
+  private final Set<String> trackedThreads =
+      listInitialThreadsForLifecycleChecks();
+
+  public void assertNoThreadLeakage() {
+    Assertions.assertThat(getCurrentThreadNames())
+        .describedAs("The threads at the end of the test run")
+        .isSubsetOf(trackedThreads);
+  }
+
+  public Set<String> getTrackedThreads() {
+    return trackedThreads;
+  }
+
+  /**
+   * Get a set containing the names of all active threads,
+   * stripping out all test runner threads.
+   * @return the current set of threads.
+   */
+  public static Set<String> getCurrentThreadNames() {
+    TreeSet<String> threads = Thread.getAllStackTraces().keySet()
+        .stream()
+        .map(Thread::getName)
+        .filter(n -> n.startsWith("JUnit"))
+        .filter(n -> n.startsWith("surefire"))
+        .collect(Collectors.toCollection(TreeSet::new));
+    return threads;
+  }
+
+  /**
+   * This creates a set containing all current threads and some well-known
+   * thread names whose existence should not fail test runs.
+   * They are generally static cleaner threads created by various classes
+   * on instantiation.
+   * @return a set of threads to use in later assertions.
+   */
+  public static Set<String> listInitialThreadsForLifecycleChecks() {
+    Set<String> threadSet = getCurrentThreadNames();
+    // static filesystem statistics cleaner
+    threadSet.add("org.apache.hadoop.fs.FileSystem$Statistics$StatisticsDataReferenceCleaner");
+
+    // java.lang.UNIXProcess. maybe if chmod is called?
+    threadSet.add("process reaper");
+    // once a quantile has been scheduled, the mutable quantile thread pool
+    // is initialized; it has a minimum thread size of 1.
+    threadSet.add("MutableQuantiles-0");
+    // IDE?
+    threadSet.add("Attach Listener");
+    return threadSet;
+  }
+}
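A hedged sketch of how a suite would typically hook the tracker in, using the JUnit 4 lifecycle annotations already used by these tests; the field and method names are illustrative.

    // created once, capturing the threads present before any test runs
    private static final ThreadLeakTracker THREAD_LEAK_TRACKER = new ThreadLeakTracker();

    @AfterClass
    public static void checkForThreadLeakage() {
      // fails if any thread not present (or allow-listed) at startup survives
      THREAD_LEAK_TRACKER.assertNoThreadLeakage();
    }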

+ 380 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/UnreliableManifestStoreOperations.java

@@ -0,0 +1,380 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InterruptedIOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
+import org.apache.hadoop.util.JsonSerialization;
+
+import static java.util.Objects.requireNonNull;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.OPERATION_TIMED_OUT;
+
+/**
+ * Wrap an existing {@link ManifestStoreOperations} implementation and fail on
+ * specific paths.
+ * This is for testing. It could be implemented via
+ * Mockito 2 spy code, but is not, so that:
+ * 1. It can be backported to Hadoop versions using Mockito 1.x.
+ * 2. It can be extended for use in production. This is why it is in
+ * the production module, to allow downstream tests to adopt it.
+ * 3. You can actually debug what's going on.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public class UnreliableManifestStoreOperations extends ManifestStoreOperations {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      UnreliableManifestStoreOperations.class);
+
+  /**
+   * The timeout message ABFS raises.
+   */
+  public static final String E_TIMEOUT
+      = "Operation could not be completed within the specified time";
+
+  /**
+   * Text to use in simulated failure exceptions.
+   */
+  public static final String SIMULATED_FAILURE = "Simulated failure";
+
+  /**
+   * Underlying store operations to wrap.
+   */
+  private final ManifestStoreOperations wrappedOperations;
+
+  /**
+   * Paths of delete operations to fail.
+   */
+  private final Set<Path> deletePathsToFail = new HashSet<>();
+
+  /**
+   * Paths of delete operations to time out, as ABFS may.
+   */
+  private final Set<Path> deletePathsToTimeOut = new HashSet<>();
+
+  /**
+   * Paths of List operations to fail.
+   */
+  private final Set<Path> listToFail = new HashSet<>();
+
+  /**
+   * Paths of mkdirs operations to fail.
+   */
+  private final Set<Path> mkdirsToFail = new HashSet<>();
+
+  /**
+   * Paths which don't exist.
+   */
+  private final Set<Path> pathNotFound = new HashSet<>();
+
+  /**
+   * Source file whose rename/commit will fail.
+   */
+  private final Set<Path> renameSourceFilesToFail = new HashSet<>();
+
+  /**
+   * Dest dir into which all renames/commits will fail.
+   * Subdirectories under this are not checked.
+   */
+  private final Set<Path> renameDestDirsToFail = new HashSet<>();
+
+  /**
+   * Path of save() to fail.
+   */
+  private final Set<Path> saveToFail = new HashSet<>();
+
+  /**
+   * Sleep time in milliseconds used when simulating a timeout.
+   */
+  private int timeoutSleepTimeMillis;
+
+  /**
+   * Should rename throw an exception or just return false?
+   */
+  private boolean renameToFailWithException = true;
+
+  /**
+   * Constructor.
+   * @param wrappedOperations operations to wrap.
+   */
+  public UnreliableManifestStoreOperations(final ManifestStoreOperations wrappedOperations) {
+    this.wrappedOperations = wrappedOperations;
+  }
+
+
+  /**
+   * Reset everything.
+   */
+  public void reset() {
+    deletePathsToFail.clear();
+    deletePathsToTimeOut.clear();
+    listToFail.clear();
+    mkdirsToFail.clear();
+    pathNotFound.clear();
+    renameSourceFilesToFail.clear();
+    renameDestDirsToFail.clear();
+    saveToFail.clear();
+    timeoutSleepTimeMillis = 0;
+  }
+
+  public int getTimeoutSleepTimeMillis() {
+    return timeoutSleepTimeMillis;
+  }
+
+  public void setTimeoutSleepTimeMillis(final int timeoutSleepTimeMillis) {
+    this.timeoutSleepTimeMillis = timeoutSleepTimeMillis;
+  }
+
+  public boolean getRenameToFailWithException() {
+    return renameToFailWithException;
+  }
+
+  public void setRenameToFailWithException(
+      final boolean renameToFailWithException) {
+    this.renameToFailWithException = renameToFailWithException;
+  }
+
+  /**
+   * Add a path to the list of delete paths to fail.
+   * @param path path to add.
+   */
+  public void addDeletePathToFail(Path path) {
+    deletePathsToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of delete paths to time out.
+   * @param path path to add.
+   */
+  public void addDeletePathToTimeOut(Path path) {
+    deletePathsToTimeOut.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of paths where list will fail.
+   * @param path path to add.
+   */
+  public void addListToFail(Path path) {
+    listToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of mkdir calls to fail.
+   * @param path path to add.
+   */
+  public void addMkdirsToFail(Path path) {
+    mkdirsToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path not found.
+   * @param path path
+   */
+  public void addPathNotFound(Path path) {
+    pathNotFound.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of rename source paths to fail.
+   * @param path path to add.
+   */
+  public void addRenameSourceFilesToFail(Path path) {
+    renameSourceFilesToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of dest dirs to fail.
+   * @param path path to add.
+   */
+  public void addRenameDestDirsFail(Path path) {
+    renameDestDirsToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Add a path to the list of paths where save will fail.
+   * @param path path to add.
+   */
+  public void addSaveToFail(Path path) {
+    saveToFail.add(requireNonNull(path));
+  }
+
+  /**
+   * Raise an exception if the path is in the set of target paths.
+   * @param operation operation which failed.
+   * @param path path to check
+   * @param paths paths to probe for {@code path} being in.
+   * @throws IOException simulated failure
+   */
+  private void maybeRaiseIOE(String operation, Path path, Set<Path> paths)
+      throws IOException {
+    if (paths.contains(path)) {
+      LOG.info("Simulating failure of {} with {}", operation, path);
+      throw new PathIOException(path.toString(),
+          SIMULATED_FAILURE + " of " + operation);
+    }
+  }
+
+  /**
+   * Verify that a path is not on the file not found list.
+   * @param path path
+   * @throws FileNotFoundException if configured to fail.
+   */
+  private void verifyExists(Path path) throws FileNotFoundException {
+    if (pathNotFound.contains(path)) {
+      throw new FileNotFoundException(path.toString());
+    }
+  }
+
+  /**
+   * Time out if the path is in the list of timeout paths.
+   * Will sleep first, to help simulate delays.
+   * @param operation operation which failed.
+   * @param path path to check
+   * @param paths paths to probe for {@code path} being in.
+   * @throws IOException simulated timeout
+   */
+  private void maybeTimeout(String operation, Path path, Set<Path> paths)
+      throws IOException {
+    if (paths.contains(path)) {
+      LOG.info("Simulating timeout of {} with {}", operation, path);
+      try {
+        if (timeoutSleepTimeMillis > 0) {
+          Thread.sleep(timeoutSleepTimeMillis);
+        }
+      } catch (InterruptedException e) {
+        throw new InterruptedIOException(e.toString());
+      }
+      throw new PathIOException(path.toString(),
+          "ErrorCode=" + OPERATION_TIMED_OUT
+              + " ErrorMessage=" + E_TIMEOUT);
+    }
+  }
+
+  @Override
+  public FileStatus getFileStatus(final Path path) throws IOException {
+    verifyExists(path);
+    return wrappedOperations.getFileStatus(path);
+  }
+
+  @Override
+  public boolean delete(final Path path, final boolean recursive)
+      throws IOException {
+    String op = "delete";
+    maybeTimeout(op, path, deletePathsToTimeOut);
+    maybeRaiseIOE(op, path, deletePathsToFail);
+    return wrappedOperations.delete(path, recursive);
+  }
+
+  @Override
+  public boolean mkdirs(final Path path) throws IOException {
+    maybeRaiseIOE("mkdirs", path, mkdirsToFail);
+    return wrappedOperations.mkdirs(path);
+  }
+
+  @Override
+  public boolean renameFile(final Path source, final Path dest)
+      throws IOException {
+    String op = "rename";
+    if (renameToFailWithException) {
+      maybeRaiseIOE(op, source, renameSourceFilesToFail);
+      maybeRaiseIOE(op, dest.getParent(), renameDestDirsToFail);
+    } else {
+      if (renameSourceFilesToFail.contains(source)
+          || renameDestDirsToFail.contains(dest.getParent())) {
+        LOG.info("Failing rename({}, {})", source, dest);
+        return false;
+      }
+    }
+    return wrappedOperations.renameFile(source, dest);
+  }
+
+  @Override
+  public RemoteIterator<FileStatus> listStatusIterator(final Path path)
+      throws IOException {
+    verifyExists(path);
+    maybeRaiseIOE("listStatus", path, listToFail);
+    return wrappedOperations.listStatusIterator(path);
+  }
+
+  @Override
+  public TaskManifest loadTaskManifest(JsonSerialization<TaskManifest> serializer,
+      final FileStatus st) throws IOException {
+    verifyExists(st.getPath());
+    return wrappedOperations.loadTaskManifest(serializer, st);
+  }
+
+  @Override
+  public <T extends AbstractManifestData<T>> void save(T manifestData,
+      final Path path,
+      final boolean overwrite) throws IOException {
+    maybeRaiseIOE("save", path, saveToFail);
+    wrappedOperations.save(manifestData, path, overwrite);
+  }
+
+  @Override
+  public void msync(Path path) throws IOException {
+    wrappedOperations.msync(path);
+  }
+
+  @Override
+  public String getEtag(FileStatus status) {
+    return wrappedOperations.getEtag(status);
+  }
+
+  @Override
+  public boolean storeSupportsResilientCommit() {
+    return wrappedOperations.storeSupportsResilientCommit();
+  }
+
+  @Override
+  public CommitFileResult commitFile(final FileEntry entry)
+      throws IOException {
+    if (renameToFailWithException) {
+      maybeRaiseIOE("commitFile",
+          entry.getSourcePath(), renameSourceFilesToFail);
+      maybeRaiseIOE("commitFile",
+          entry.getDestPath().getParent(), renameDestDirsToFail);
+    }
+    return wrappedOperations.commitFile(entry);
+  }
+
+  @Override
+  public boolean storePreservesEtagsThroughRenames(Path path) {
+    return wrappedOperations.storePreservesEtagsThroughRenames(path);
+  }
+
+  @Override
+  public void close() throws IOException {
+    wrappedOperations.close();
+  }
+
+}
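A hedged sketch of the failure-injection pattern the rename tests earlier in this patch build on; realOperations and entry stand in for whatever ManifestStoreOperations instance and FileEntry the test has created.

    UnreliableManifestStoreOperations failures =
        new UnreliableManifestStoreOperations(realOperations);
    failures.addRenameSourceFilesToFail(entry.getSourcePath());
    failures.setRenameToFailWithException(true);
    // renameFile()/commitFile() of that source now raises a PathIOException
    // containing SIMULATED_FAILURE; with the flag set to false, renameFile()
    // instead returns false and the calling stage must escalate that itself.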

+ 29 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/package-info.java

@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * These test suites are subclassed in internal modules (hadoop-azure) and
+ * externally for GCS integration testing.
+ * Test with both before making changes.
+ */
+@InterfaceAudience.LimitedPrivate("integration tests")
+@InterfaceStability.Unstable
+package org.apache.hadoop.mapreduce.lib.output.committer.manifest;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;

+ 138 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/contract/localfs.xml

@@ -0,0 +1,138 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one
+  ~  or more contributor license agreements.  See the NOTICE file
+  ~  distributed with this work for additional information
+  ~  regarding copyright ownership.  The ASF licenses this file
+  ~  to you under the Apache License, Version 2.0 (the
+  ~  "License"); you may not use this file except in compliance
+  ~  with the License.  You may obtain a copy of the License at
+  ~
+  ~       http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~  Unless required by applicable law or agreed to in writing, software
+  ~  distributed under the License is distributed on an "AS IS" BASIS,
+  ~  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~  See the License for the specific language governing permissions and
+  ~  limitations under the License.
+  -->
+
+<configuration>
+
+<!--
+This is a copy of the file
+hadoop-common-project/hadoop-common/src/test/resources/contract/localfs.xml ;
+it is needed for all contract tests which test against the local filesystem.
+
+case sensitivity and permission options are determined at run time from OS type.
+-->
+
+  <property>
+    <name>fs.contract.is-case-sensitive</name>
+    <value>true</value>
+  </property>
+
+
+  <property>
+    <name>fs.contract.supports-unix-permissions</name>
+    <value>true</value>
+  </property>
+
+  <!--
+  The remaining options are static
+  -->
+
+  <property>
+    <name>fs.contract.test.root-tests-enabled</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.contract.test.random-seek-count</name>
+    <value>1000</value>
+  </property>
+
+  <property>
+    <name>fs.contract.rename-creates-dest-dirs</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.rename-overwrites-dest</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.rename-remove-dest-if-empty-dir</name>
+    <value>true</value>
+  </property>
+
+  <!--
+  checksummed filesystems do not support append; see HADOOP-4292
+  -->
+  <property>
+    <name>fs.contract.supports-append</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-atomic-directory-delete</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-atomic-rename</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-block-locality</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-concat</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-seek</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-seek-on-closed-file</name>
+    <value>true</value>
+  </property>
+
+  <!-- checksum FS doesn't allow seeking past EOF -->
+  <property>
+    <name>fs.contract.rejects-seek-past-eof</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-strict-exceptions</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-settimes</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-getfilestatus</name>
+    <value>true</value>
+  </property>
+
+</configuration>
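A hedged sketch of how such contract options are consumed: the contract test framework loads this resource and queries feature keys under the fs.contract. prefix, roughly equivalent to the plain Configuration lookup below.

    Configuration contractConf = new Configuration(false);
    contractConf.addResource("contract/localfs.xml");
    boolean renameOverwrites =
        contractConf.getBoolean("fs.contract.rename-overwrites-dest", false);
    // tests skip or tighten their assertions depending on such flags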

+ 33 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/core-site.xml

@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one
+  ~  or more contributor license agreements.  See the NOTICE file
+  ~  distributed with this work for additional information
+  ~  regarding copyright ownership.  The ASF licenses this file
+  ~  to you under the Apache License, Version 2.0 (the
+  ~  "License"); you may not use this file except in compliance
+  ~  with the License.  You may obtain a copy of the License at
+  ~
+  ~       http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~  Unless required by applicable law or agreed to in writing, software
+  ~  distributed under the License is distributed on an "AS IS" BASIS,
+  ~  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~  See the License for the specific language governing permissions and
+  ~  limitations under the License.
+  -->
+
+<!-- Values used when running unit tests.  Specify any values in here that
+     should override the default values. -->
+
+<configuration>
+
+  <property>
+    <name>hadoop.tmp.dir</name>
+    <value>target/build/test</value>
+    <description>A base for other temporary directories.</description>
+    <final>false</final>
+  </property>
+
+</configuration>