
Merge trunk into HA branch.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1236333 13f79535-47bb-0310-9956-ffa450edef68
Aaron Myers 13 years ago
Parent
Commit
1c95060a72
67 files changed, with 10719 insertions and 151 deletions
  1. 15 13
      hadoop-common-project/hadoop-common/CHANGES.txt
  2. 11 1
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
  3. 32 0
      hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserFromEnv.java
  4. 2 2
      hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/server/HttpFSServer.java
  5. 2 0
      hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
  6. 18 0
      hadoop-mapreduce-project/CHANGES.txt
  7. 8 13
      hadoop-mapreduce-project/bin/mapred
  8. 26 12
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/MergeManager.java
  9. 7 10
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApp.java
  10. 29 1
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueUtils.java
  11. 6 2
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
  12. 45 52
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
  13. 4 10
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java
  14. 1 1
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java
  15. 62 16
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java
  16. 7 7
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java
  17. 31 1
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestQueueParsing.java
  18. 13 10
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesCapacitySched.java
  19. 18 0
      hadoop-project/pom.xml
  20. 7 0
      hadoop-tools/hadoop-distcp/README
  21. 198 0
      hadoop-tools/hadoop-distcp/pom.xml
  22. 218 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyListing.java
  23. 405 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCp.java
  24. 104 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java
  25. 218 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java
  26. 525 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java
  27. 100 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/FileBasedCopyListing.java
  28. 105 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java
  29. 246 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java
  30. 275 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java
  31. 297 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java
  32. 330 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyMapper.java
  33. 124 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyOutputFormat.java
  34. 56 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableDirectoryCreateCommand.java
  35. 245 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java
  36. 169 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/UniformSizeInputFormat.java
  37. 246 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputChunk.java
  38. 292 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputFormat.java
  39. 203 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicRecordReader.java
  40. 343 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
  41. 106 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java
  42. 139 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/ThrottledInputStream.java
  43. 41 0
      hadoop-tools/hadoop-distcp/src/main/resources/distcp-default.xml
  44. 98 0
      hadoop-tools/hadoop-distcp/src/site/fml/faq.fml
  45. 47 0
      hadoop-tools/hadoop-distcp/src/site/pdf.xml
  46. 125 0
      hadoop-tools/hadoop-distcp/src/site/xdoc/appendix.xml
  47. 200 0
      hadoop-tools/hadoop-distcp/src/site/xdoc/architecture.xml
  48. 123 0
      hadoop-tools/hadoop-distcp/src/site/xdoc/cli.xml
  49. 32 0
      hadoop-tools/hadoop-distcp/src/site/xdoc/index.xml
  50. 147 0
      hadoop-tools/hadoop-distcp/src/site/xdoc/usage.xml
  51. 139 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/StubContext.java
  52. 252 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java
  53. 275 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCp.java
  54. 542 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestFileBasedCopyListing.java
  55. 135 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestGlobbedCopyListing.java
  56. 466 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestIntegration.java
  57. 497 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestOptionsParser.java
  58. 419 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyCommitter.java
  59. 826 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java
  60. 135 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyOutputFormat.java
  61. 254 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestUniformSizeInputFormat.java
  62. 162 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/lib/TestDynamicInputFormat.java
  63. 220 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestDistCpUtils.java
  64. 81 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestRetriableCommand.java
  65. 157 0
      hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestThrottledInputStream.java
  66. 57 0
      hadoop-tools/hadoop-distcp/src/test/resources/sslConfig.xml
  67. 1 0
      hadoop-tools/pom.xml

+ 15 - 13
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -35,11 +35,6 @@ Trunk (unreleased changes)
     HADOOP-7717. Move handling of concurrent client fail-overs to
     RetryInvocationHandler (atm)
 
-    HADOOP-6490. Use StringUtils over String#replace in Path#normalizePath.
-    (Uma Maheswara Rao G via harsh)
-
-    HADOOP-7736. Remove duplicate Path#normalizePath call. (harsh)
-
     HADOOP-7664. Remove warmings when overriding final parameter configuration
     if the override value is same as the final parameter value.
     (Ravi Prakash via suresh)
@@ -68,12 +63,6 @@ Trunk (unreleased changes)
 
     HADOOP-7899. Generate proto java files as part of the build. (tucu)
 
-    HADOOP-7574. Improve FSShell -stat, add user/group elements.
-    (XieXianshan via harsh)
-
-    HADOOP-7919. Remove the unused hadoop.logfile.* properties from the 
-    core-default.xml file. (harsh)
-
     HADOOP-7808. Port HADOOP-7510 - Add configurable option to use original 
     hostname in token instead of IP to allow server IP change. 
     (Daryn Sharp via suresh)
@@ -81,10 +70,10 @@ Trunk (unreleased changes)
     HADOOP-7957. Classes deriving GetGroupsBase should be able to override 
     proxy creation. (jitendra)
 
-    HADOOP-4515. Configuration#getBoolean must not be case sensitive. (Sho Shimauchi via harsh)
-
     HADOOP-7968. Errant println left in RPC.getHighestSupportedProtocol (Sho Shimauchi via harsh)
 
+    HADOOP-7987. Support setting the run-as user in unsecure mode. (jitendra)
+
   BUGS
 
     HADOOP-7851. Configuration.getClasses() never returns the default value. 
@@ -207,6 +196,19 @@ Release 0.23.1 - Unreleased
 
     HADOOP-7975. Add LZ4 as an entry in the default codec list, missed by HADOOP-7657 (harsh)
 
+    HADOOP-4515. Configuration#getBoolean must not be case sensitive. (Sho Shimauchi via harsh)
+
+    HADOOP-6490. Use StringUtils over String#replace in Path#normalizePath.
+    (Uma Maheswara Rao G via harsh)
+
+    HADOOP-7574. Improve FSShell -stat, add user/group elements.
+    (XieXianshan via harsh)
+
+    HADOOP-7736. Remove duplicate Path#normalizePath call. (harsh)
+
+    HADOOP-7919. Remove the unused hadoop.logfile.* properties from the 
+    core-default.xml file. (harsh)
+
   OPTIMIZATIONS
 
   BUG FIXES

+ 11 - 1
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java

@@ -80,6 +80,7 @@ public class UserGroupInformation {
    * Percentage of the ticket window to use before we renew ticket.
    */
   private static final float TICKET_RENEW_WINDOW = 0.80f;
+  static final String HADOOP_USER_NAME = "HADOOP_USER_NAME";
   
   /** 
    * UgiMetrics maintains UGI activity statistics
@@ -137,7 +138,16 @@ public class UserGroupInformation {
           LOG.debug("using kerberos user:"+user);
         }
       }
-      // if we don't have a kerberos user, use the OS user
+      //If we don't have a kerberos user and security is disabled, check
+      //if user is specified in the environment or properties
+      if (!isSecurityEnabled() && (user == null)) {
+        String envUser = System.getenv(HADOOP_USER_NAME);
+        if (envUser == null) {
+          envUser = System.getProperty(HADOOP_USER_NAME);
+        }
+        user = envUser == null ? null : new User(envUser);
+      }
+      // use the OS user
       if (user == null) {
         user = getCanonicalUser(OS_PRINCIPAL_CLASS);
         if (LOG.isDebugEnabled()) {

+ 32 - 0
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserFromEnv.java

@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.security;
+
+import java.io.IOException;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestUserFromEnv {
+
+  @Test
+  public void testUserFromEnvironment() throws IOException {
+    System.setProperty(UserGroupInformation.HADOOP_USER_NAME, "randomUser");
+    Assert.assertEquals("randomUser", UserGroupInformation.getLoginUser()
+        .getUserName());
+  }
+}
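
For context on the HADOOP-7987 change being tested above: the new lookup in UserGroupInformation honors the HADOOP_USER_NAME environment variable first and falls back to the JVM system property, and only when security is disabled. The sketch below is a hypothetical standalone illustration of the same call path the test exercises; it is not part of the commit.

```java
import java.io.IOException;
import org.apache.hadoop.security.UserGroupInformation;

public class WhoAmIDemo {
  public static void main(String[] args) throws IOException {
    // From a shell you would typically export the variable instead, e.g.
    //   HADOOP_USER_NAME=alice hadoop fs -ls /
    // Here we use the system-property fallback, which the same code path checks
    // when the environment variable is absent and Kerberos is not enabled.
    System.setProperty("HADOOP_USER_NAME", "alice");
    UserGroupInformation ugi = UserGroupInformation.getLoginUser();
    System.out.println("Running as: " + ugi.getUserName()); // expected: alice
  }
}
```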

+ 2 - 2
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/server/HttpFSServer.java

@@ -219,7 +219,7 @@ public class HttpFSServer {
    * operation is @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.GetOpValues#LISTSTATUS}
    * @param doAs user being impersonated, defualt value is none. It can be used
    * only if the current user is a HttpFSServer proxyuser.
-   * @param override, default is true. Used only for
+   * @param override default is true. Used only for
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
    * @param blockSize block size to set, used only by
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
@@ -419,7 +419,7 @@ public class HttpFSServer {
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#SETOWNER} operations.
    * @param group group to set, used only for
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#SETOWNER} operations.
-   * @param override, default is true. Used only for
+   * @param override default is true. Used only for
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
    * @param blockSize block size to set, used only by
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.

+ 2 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -363,6 +363,8 @@ Release 0.23.1 - UNRELEASED
 
     HDFS-442. dfsthroughput in test jar throws NPE (harsh)
 
+    HDFS-2836. HttpFSServer still has 2 javadoc warnings in trunk (revans2 via tucu)
+
 Release 0.23.0 - 2011-11-01 
 
   INCOMPATIBLE CHANGES

+ 18 - 0
hadoop-mapreduce-project/CHANGES.txt

@@ -52,6 +52,9 @@ Trunk (unreleased changes)
     MAPREDUCE-2944. Improve checking of input for JobClient.displayTasks() (XieXianshan via harsh)
 
   BUG FIXES
+    MAPREDUCE-3194. "mapred mradmin" command is broken in mrv2
+                     (Jason Lowe via bobby)
+
     MAPREDUCE-3462. Fix Gridmix JUnit testcase failures. 
                     (Ravi Prakash and Ravi Gummadi via amarrk)
 
@@ -192,6 +195,8 @@ Release 0.23.1 - Unreleased
     MAPREDUCE-3710. Improved FileInputFormat to return better locality for the
     last split. (Siddarth Seth via vinodkv)
 
+    MAPREDUCE-2765. DistCp Rewrite. (Mithun Radhakrishnan via mahadev)
+
   OPTIMIZATIONS
 
     MAPREDUCE-3567. Extraneous JobConf objects in AM heap. (Vinod Kumar
@@ -558,6 +563,19 @@ Release 0.23.1 - Unreleased
     MAPREDUCE-3630. Fixes a NullPointer exception while running TeraGen - if a
     map is asked to generate 0 records. (Mahadev Konar via sseth)
 
+    MAPREDUCE-3683. Fixed maxCapacity of queues to be product of parent
+    maxCapacities. (acmurthy)
+
+    MAPREDUCE-3713. Fixed the way head-room is allocated to applications by
+    CapacityScheduler so that it deducts current-usage per user and not
+    per-application. (Arun C Murthy via vinodkv)
+
+    MAPREDUCE-3721. Fixed a race in shuffle which caused reduces to hang.
+    (sseth via acmurthy) 
+
+    MAPREDUCE-3733. Add Apache License Header to hadoop-distcp/pom.xml.
+    (mahadev)
+
 Release 0.23.0 - 2011-11-01 
 
   INCOMPATIBLE CHANGES

+ 8 - 13
hadoop-mapreduce-project/bin/mapred

@@ -30,9 +30,6 @@ fi
 function print_usage(){
   echo "Usage: mapred [--config confdir] COMMAND"
   echo "       where COMMAND is one of:"
-  echo "  mradmin              run a Map-Reduce admin client"
-  echo "  jobtracker           run the MapReduce job Tracker node" 
-  echo "  tasktracker          run a MapReduce task Tracker node" 
   echo "  pipes                run a Pipes job"
   echo "  job                  manipulate MapReduce jobs"
   echo "  queue                get information regarding JobQueues"
@@ -51,16 +48,7 @@ fi
 COMMAND=$1
 shift
 
-if [ "$COMMAND" = "mradmin" ] ; then
-  CLASS=org.apache.hadoop.mapred.tools.MRAdmin
-  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
-elif [ "$COMMAND" = "jobtracker" ] ; then
-  CLASS=org.apache.hadoop.mapred.JobTracker
-  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
-elif [ "$COMMAND" = "tasktracker" ] ; then
-  CLASS=org.apache.hadoop.mapred.TaskTracker
-  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
-elif [ "$COMMAND" = "job" ] ; then
+if [ "$COMMAND" = "job" ] ; then
   CLASS=org.apache.hadoop.mapred.JobClient
 elif [ "$COMMAND" = "queue" ] ; then
   CLASS=org.apache.hadoop.mapred.JobQueueClient
@@ -75,6 +63,13 @@ elif [ "$COMMAND" = "classpath" ] ; then
 elif [ "$COMMAND" = "groups" ] ; then
   CLASS=org.apache.hadoop.mapred.tools.GetGroups
   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "mradmin" ] \
+    || [ "$COMMAND" = "jobtracker" ] \
+    || [ "$COMMAND" = "tasktracker" ] ; then
+  echo "Sorry, the $COMMAND command is no longer supported."
+  echo "You may find similar functionality with the \"yarn\" shell command."
+  print_usage
+  exit
 else
   echo $COMMAND - invalid command
   print_usage

+ 26 - 12
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/MergeManager.java

@@ -92,6 +92,7 @@ public class MergeManager<K, V> {
   
   private final long memoryLimit;
   private long usedMemory;
+  private long commitMemory;
   private final long maxSingleShuffleLimit;
   
   private final int memToMemMergeOutputsThreshold; 
@@ -181,6 +182,13 @@ public class MergeManager<K, V> {
              "ioSortFactor=" + ioSortFactor + ", " +
              "memToMemMergeOutputsThreshold=" + memToMemMergeOutputsThreshold);
 
+    if (this.maxSingleShuffleLimit >= this.mergeThreshold) {
+      throw new RuntimeException("Invlaid configuration: "
+          + "maxSingleShuffleLimit should be less than mergeThreshold"
+          + "maxSingleShuffleLimit: " + this.maxSingleShuffleLimit
+          + "mergeThreshold: " + this.mergeThreshold);
+    }
+
     boolean allowMemToMemMerge = 
       jobConf.getBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false);
     if (allowMemToMemMerge) {
@@ -245,16 +253,16 @@ public class MergeManager<K, V> {
     // all the stalled threads
     
     if (usedMemory > memoryLimit) {
-      LOG.debug(mapId + ": Stalling shuffle since usedMemory (" + usedMemory + 
-               ") is greater than memoryLimit (" + memoryLimit + ")"); 
-      
+      LOG.debug(mapId + ": Stalling shuffle since usedMemory (" + usedMemory
+          + ") is greater than memoryLimit (" + memoryLimit + ")." + 
+          " CommitMemory is (" + commitMemory + ")"); 
       return stallShuffle;
     }
     
     // Allow the in-memory shuffle to progress
-    LOG.debug(mapId + ": Proceeding with shuffle since usedMemory (" +
-        usedMemory + 
-        ") is lesser than memoryLimit (" + memoryLimit + ")"); 
+    LOG.debug(mapId + ": Proceeding with shuffle since usedMemory ("
+        + usedMemory + ") is lesser than memoryLimit (" + memoryLimit + ")."
+        + "CommitMemory is (" + commitMemory + ")"); 
     return unconditionalReserve(mapId, requestedSize, true);
   }
   
@@ -270,18 +278,24 @@ public class MergeManager<K, V> {
   }
   
   synchronized void unreserve(long size) {
+    commitMemory -= size;
     usedMemory -= size;
   }
-  
+
   public synchronized void closeInMemoryFile(MapOutput<K,V> mapOutput) { 
     inMemoryMapOutputs.add(mapOutput);
     LOG.info("closeInMemoryFile -> map-output of size: " + mapOutput.getSize()
-        + ", inMemoryMapOutputs.size() -> " + inMemoryMapOutputs.size());
-    
+        + ", inMemoryMapOutputs.size() -> " + inMemoryMapOutputs.size()
+        + ", commitMemory -> " + commitMemory + ", usedMemory ->" + usedMemory);
+
+    commitMemory+= mapOutput.getSize();
+
     synchronized (inMemoryMerger) {
-      if (!inMemoryMerger.isInProgress() && usedMemory >= mergeThreshold) {
-        LOG.info("Starting inMemoryMerger's merge since usedMemory=" +
-            usedMemory + " > mergeThreshold=" + mergeThreshold);
+      // Can hang if mergeThreshold is really low.
+      if (!inMemoryMerger.isInProgress() && commitMemory >= mergeThreshold) {
+        LOG.info("Starting inMemoryMerger's merge since commitMemory=" +
+            commitMemory + " > mergeThreshold=" + mergeThreshold + 
+            ". Current usedMemory=" + usedMemory);
         inMemoryMapOutputs.addAll(inMemoryMergedMapOutputs);
        inMemoryMergedMapOutputs.clear();
        inMemoryMerger.startMerge(inMemoryMapOutputs);
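
A rough sketch of the new merge trigger (MAPREDUCE-3721), using assumed sizes that are not taken from the commit: the in-memory merge now fires on commitMemory, i.e. memory whose map outputs have actually been fetched and closed, rather than on usedMemory, which also counts reservations for fetches still in flight.

```java
public class MergeTriggerDemo {
  public static void main(String[] args) {
    long mergeThreshold = 64L << 20; // 64 MB, an assumed configuration value
    long usedMemory = 0;             // reserved, including fetches still in flight
    long commitMemory = 0;           // closed (fully fetched) in-memory outputs only

    usedMemory += 48L << 20;         // reserve() for an output being shuffled
    // ...fetch completes and closeInMemoryFile() runs:
    commitMemory += 48L << 20;

    // The old check keyed off usedMemory; the fixed check uses commitMemory,
    // so a partially arrived output can no longer start (and then stall) a merge.
    boolean startMerge = commitMemory >= mergeThreshold;
    System.out.println("start merge? " + startMerge); // false: 48 MB < 64 MB
  }
}
```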

+ 7 - 10
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApp.java

@@ -295,10 +295,6 @@ public class SchedulerApp {
     }
   }
 
-  public synchronized void setAvailableResourceLimit(Resource globalLimit) {
-    this.resourceLimit = globalLimit; 
-  }
-
   public synchronized RMContainer getRMContainer(ContainerId id) {
     return liveContainers.get(id);
   }
@@ -446,20 +442,21 @@ public class SchedulerApp {
     return reservedContainers;
   }
   
+  public synchronized void setHeadroom(Resource globalLimit) {
+    this.resourceLimit = globalLimit; 
+  }
+
   /**
    * Get available headroom in terms of resources for the application's user.
    * @return available resource headroom
    */
   public synchronized Resource getHeadroom() {
-    Resource limit = Resources.subtract(resourceLimit, currentConsumption);
-    Resources.subtractFrom(limit, currentReservation);
-
     // Corner case to deal with applications being slightly over-limit
-    if (limit.getMemory() < 0) {
-      limit.setMemory(0);
+    if (resourceLimit.getMemory() < 0) {
+      resourceLimit.setMemory(0);
     }
     
-    return limit;
+    return resourceLimit;
   }
 
   public Queue getQueue() {
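
A minimal numeric sketch (assumed figures) of what setHeadroom() now receives under MAPREDUCE-3713: the scheduler subtracts the consumption of all of a user's applications from that user's limit and clamps at zero, instead of each application subtracting only its own usage and reservation.

```java
public class HeadroomDemo {
  public static void main(String[] args) {
    int userLimitMB    = 8 * 1024;  // what computeUserLimit(...) allows this user
    int userConsumedMB = 6 * 1024;  // total across all of the user's applications
    int headroomMB = Math.max(0, userLimitMB - userConsumedMB);
    // Every application of this user is told the same 2048 MB of headroom.
    System.out.println("headroom = " + headroomMB + " MB");
  }
}
```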

+ 29 - 1
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueUtils.java

@@ -17,12 +17,19 @@
  */
 package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
 
+import org.apache.hadoop.yarn.api.records.Resource;
+
 class CSQueueUtils {
   
   public static void checkMaxCapacity(String queueName, 
       float capacity, float maximumCapacity) {
-    if (Math.round(100 * maximumCapacity) != CapacitySchedulerConfiguration.UNDEFINED && 
+    if (maximumCapacity < 0.0f || maximumCapacity > 1.0f || 
         maximumCapacity < capacity) {
+      throw new IllegalArgumentException(
+          "Illegal value  of maximumCapacity " + maximumCapacity + 
+          " used in call to setMaxCapacity for queue " + queueName);
+    }
+    if (maximumCapacity < capacity) {
       throw new IllegalArgumentException(
           "Illegal call to setMaxCapacity. " +
           "Queue '" + queueName + "' has " +
@@ -30,5 +37,26 @@ class CSQueueUtils {
           "maximumCapacity (" + maximumCapacity + ")" );
     }
   }
+
+  public static float computeAbsoluteMaximumCapacity(
+      float maximumCapacity, CSQueue parent) {
+    float parentAbsMaxCapacity = 
+        (parent == null) ? 1.0f : parent.getAbsoluteMaximumCapacity();
+    return (parentAbsMaxCapacity * maximumCapacity);
+  }
+
+  public static int computeMaxActiveApplications(Resource clusterResource,
+      float maxAMResourcePercent, float absoluteCapacity) {
+    return 
+        Math.max(
+            (int)((clusterResource.getMemory() / (float)LeafQueue.DEFAULT_AM_RESOURCE) * 
+                   maxAMResourcePercent * absoluteCapacity), 
+            1);
+  }
+
+  public static int computeMaxActiveApplicationsPerUser(
+      int maxActiveApplications, int userLimit, float userLimitFactor) {
+    return (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor);
+  }
   
 }
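
To make the two new helpers concrete, here is a back-of-the-envelope calculation with assumed inputs; the per-AM resource below is only a stand-in for LeafQueue.DEFAULT_AM_RESOURCE, whose value is not shown in this diff.

```java
public class MaxActiveAppsDemo {
  public static void main(String[] args) {
    int clusterMemoryMB = 100 * 1024;   // assumed 100 GB cluster
    int amResourceMB    = 2 * 1024;     // assumed per-ApplicationMaster resource
    float maxAMResourcePercent = 0.1f;  // 10% of the cluster may be AMs
    float absoluteCapacity     = 0.2f;  // queue sits at 20% of the cluster
    int userLimit = 100;
    float userLimitFactor = 1.0f;

    // Same formula as CSQueueUtils.computeMaxActiveApplications:
    int maxActiveApplications = Math.max(
        (int) ((clusterMemoryMB / (float) amResourceMB)
            * maxAMResourcePercent * absoluteCapacity),
        1);                              // (50 * 0.1 * 0.2) = 1.0 -> 1

    // Same formula as computeMaxActiveApplicationsPerUser:
    int maxActiveApplicationsPerUser =
        (int) (maxActiveApplications * (userLimit / 100.0f) * userLimitFactor); // 1

    System.out.println(maxActiveApplications + " / " + maxActiveApplicationsPerUser);
  }
}
```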

+ 6 - 2
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java

@@ -149,7 +149,7 @@ public class CapacitySchedulerConfiguration extends Configuration {
       throw new IllegalArgumentException("Illegal " +
       		"capacity of " + capacity + " for queue " + queue);
     }
-    LOG.debug("CSConf - setCapacity: queuePrefix=" + getQueuePrefix(queue) + 
+    LOG.debug("CSConf - getCapacity: queuePrefix=" + getQueuePrefix(queue) + 
         ", capacity=" + capacity);
     return capacity;
   }
@@ -162,11 +162,15 @@ public class CapacitySchedulerConfiguration extends Configuration {
 
   public int getMaximumCapacity(String queue) {
     int maxCapacity = 
-      getInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, UNDEFINED);
+      getInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, MAXIMUM_CAPACITY_VALUE);
     return maxCapacity;
   }
   
   public void setMaximumCapacity(String queue, int maxCapacity) {
+    if (maxCapacity > MAXIMUM_CAPACITY_VALUE) {
+      throw new IllegalArgumentException("Illegal " +
+          "maximum-capacity of " + maxCapacity + " for queue " + queue);
+    }
     setInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, maxCapacity);
     LOG.debug("CSConf - setMaxCapacity: queuePrefix=" + getQueuePrefix(queue) + 
         ", maxCapacity=" + maxCapacity);

+ 45 - 52
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java

@@ -144,10 +144,10 @@ public class LeafQueue implements CSQueue {
       (float)cs.getConfiguration().getCapacity(getQueuePath()) / 100;
     float absoluteCapacity = parent.getAbsoluteCapacity() * capacity;
 
-    float maximumCapacity = (float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
+    float maximumCapacity = 
+        (float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
     float absoluteMaxCapacity = 
-      (Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ? 
-          Float.MAX_VALUE : (parent.getAbsoluteCapacity() * maximumCapacity);
+        CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
 
     int userLimit = cs.getConfiguration().getUserLimit(getQueuePath());
     float userLimitFactor = 
@@ -161,10 +161,10 @@ public class LeafQueue implements CSQueue {
     this.maxAMResourcePercent = 
         cs.getConfiguration().getMaximumApplicationMasterResourcePercent();
     int maxActiveApplications = 
-        computeMaxActiveApplications(cs.getClusterResources(), 
+        CSQueueUtils.computeMaxActiveApplications(cs.getClusterResources(), 
            maxAMResourcePercent, absoluteCapacity);
     int maxActiveApplicationsPerUser = 
-        computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit, 
+        CSQueueUtils.computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit, 
            userLimitFactor);
 
     this.queueInfo = recordFactory.newRecordInstance(QueueInfo.class);
@@ -193,20 +193,6 @@ public class LeafQueue implements CSQueue {
     this.activeApplications = new TreeSet<SchedulerApp>(applicationComparator);
   }
 
-  private int computeMaxActiveApplications(Resource clusterResource,
-      float maxAMResourcePercent, float absoluteCapacity) {
-    return 
-        Math.max(
-            (int)((clusterResource.getMemory() / (float)DEFAULT_AM_RESOURCE) * 
-                   maxAMResourcePercent * absoluteCapacity), 
-            1);
-  }
-  
-  private int computeMaxActiveApplicationsPerUser(int maxActiveApplications, 
-      int userLimit, float userLimitFactor) {
-    return (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor);
-  }
-  
   private synchronized void setupQueueConfigs(
       float capacity, float absoluteCapacity, 
       float maximumCapacity, float absoluteMaxCapacity,
@@ -254,8 +240,8 @@ public class LeafQueue implements CSQueue {
         "maxCapacity = " + maximumCapacity +
         " [= configuredMaxCapacity ]" + "\n" +
         "absoluteMaxCapacity = " + absoluteMaxCapacity +
-        " [= Float.MAX_VALUE if maximumCapacity undefined, " +
-        "(parentAbsoluteCapacity * maximumCapacity) / 100 otherwise ]" + "\n" +
+        " [= 1.0 maximumCapacity undefined, " +
+        "(parentAbsoluteMaxCapacity * maximumCapacity) / 100 otherwise ]" + "\n" +
         "userLimit = " + userLimit +
         " [= configuredUserLimit ]" + "\n" +
         "userLimitFactor = " + userLimitFactor +
@@ -400,9 +386,7 @@ public class LeafQueue implements CSQueue {
     
     this.maximumCapacity = maximumCapacity;
     this.absoluteMaxCapacity = 
-      (Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ? 
-          Float.MAX_VALUE : 
-          (parent.getAbsoluteCapacity() * maximumCapacity);
+        CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
   }
   
   /**
@@ -736,12 +720,11 @@ public class LeafQueue implements CSQueue {
       if(LOG.isDebugEnabled()) {
         LOG.debug("pre-assignContainers for application "
         + application.getApplicationId());
+        application.showRequests();
       }
-      application.showRequests();
 
       synchronized (application) {
-        computeAndSetUserResourceLimit(application, clusterResource);
-        
+        // Schedule in priority order
         for (Priority priority : application.getPriorities()) {
           // Required resource
           Resource required = 
@@ -752,15 +735,21 @@ public class LeafQueue implements CSQueue {
             continue;
           }
 
-          // Are we going over limits by allocating to this application?
-          // Maximum Capacity of the queue
+          // Compute & set headroom
+          // Note: We set the headroom with the highest priority request 
+          //       as the target. 
+          //       This works since we never assign lower priority requests
+          //       before all higher priority ones are serviced.
+          Resource userLimit = 
+              computeAndSetUserResourceLimit(application, clusterResource, 
+                  required);
+
+          // Check queue max-capacity limit
           if (!assignToQueue(clusterResource, required)) {
             return NULL_ASSIGNMENT;
           }
 
-          // User limits
-          Resource userLimit = 
-            computeUserLimit(application, clusterResource, required); 
+          // Check user limit
           if (!assignToUser(application.getUser(), userLimit)) {
             break; 
           }
@@ -774,7 +763,7 @@ public class LeafQueue implements CSQueue {
                 null);
           
           Resource assigned = assignment.getResource();
-            
+          
           // Did we schedule or reserve a container?
           if (Resources.greaterThan(assigned, Resources.none())) {
 
@@ -835,25 +824,28 @@ public class LeafQueue implements CSQueue {
     float potentialNewCapacity = 
       (float)(usedResources.getMemory() + required.getMemory()) / 
         clusterResource.getMemory();
-    LOG.info(getQueueName() + 
-        " usedResources: " + usedResources.getMemory() + 
-        " currentCapacity " + ((float)usedResources.getMemory())/clusterResource.getMemory() + 
-        " required " + required.getMemory() +
-        " potentialNewCapacity: " + potentialNewCapacity + " ( " +
-        " max-capacity: " + absoluteMaxCapacity + ")");
     if (potentialNewCapacity > absoluteMaxCapacity) {
+      LOG.info(getQueueName() + 
+          " usedResources: " + usedResources.getMemory() +
+          " clusterResources: " + clusterResource.getMemory() +
+          " currentCapacity " + ((float)usedResources.getMemory())/clusterResource.getMemory() + 
+          " required " + required.getMemory() +
+          " potentialNewCapacity: " + potentialNewCapacity + " ( " +
+          " max-capacity: " + absoluteMaxCapacity + ")");
       return false;
     }
     return true;
   }
 
-  private void computeAndSetUserResourceLimit(SchedulerApp application, 
-      Resource clusterResource) {
-    Resource userLimit = 
-        computeUserLimit(application, clusterResource, Resources.none());
-    application.setAvailableResourceLimit(userLimit);
-    metrics.setAvailableResourcesToUser(application.getUser(), 
-        application.getHeadroom());
+  private Resource computeAndSetUserResourceLimit(SchedulerApp application, 
+      Resource clusterResource, Resource required) {
+    String user = application.getUser();
+    Resource limit = computeUserLimit(application, clusterResource, required);
+    Resource headroom = 
+        Resources.subtract(limit, getUser(user).getConsumedResources());
+    application.setHeadroom(headroom);
+    metrics.setAvailableResourcesToUser(user, headroom);
+    return limit;
   }
   
   private int roundUp(int memory) {
@@ -924,7 +916,7 @@ public class LeafQueue implements CSQueue {
     User user = getUser(userName);
     
     // Note: We aren't considering the current request since there is a fixed
-    // overhead of the AM, but it's a >= check, so... 
+    // overhead of the AM, but it's a > check, not a >= check, so... 
     if ((user.getConsumedResources().getMemory()) > limit.getMemory()) {
       if (LOG.isDebugEnabled()) {
         LOG.debug("User " + userName + " in queue " + getQueueName() + 
@@ -1242,8 +1234,8 @@ public class LeafQueue implements CSQueue {
         // happen under scheduler's lock... 
         // So, this is, in effect, a transaction across application & node
         if (rmContainer.getState() == RMContainerState.RESERVED) {
-          application.unreserve(node, rmContainer.getReservedPriority());
-          node.unreserveResource(application);
+          unreserve(application, rmContainer.getReservedPriority(), 
+              node, rmContainer);
         } else {
           application.containerCompleted(rmContainer, containerStatus, event);
           node.releaseContainer(container);
@@ -1308,15 +1300,16 @@ public class LeafQueue implements CSQueue {
   public synchronized void updateClusterResource(Resource clusterResource) {
     // Update queue properties
     maxActiveApplications = 
-        computeMaxActiveApplications(clusterResource, maxAMResourcePercent, 
+        CSQueueUtils.computeMaxActiveApplications(clusterResource, maxAMResourcePercent, 
            absoluteCapacity);
     maxActiveApplicationsPerUser = 
-        computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit, 
+        CSQueueUtils.computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit, 
            userLimitFactor);
     
     // Update application properties
     for (SchedulerApp application : activeApplications) {
-      computeAndSetUserResourceLimit(application, clusterResource);
+      computeAndSetUserResourceLimit(
+          application, clusterResource, Resources.none());
     }
   }
   

+ 4 - 10
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java

@@ -118,16 +118,14 @@ public class ParentQueue implements CSQueue {
     }
 
     float capacity = (float) rawCapacity / 100;
-
     float parentAbsoluteCapacity = 
-      (parent == null) ? 1.0f : parent.getAbsoluteCapacity();
+      (rootQueue) ? 1.0f : parent.getAbsoluteCapacity();
     float absoluteCapacity = parentAbsoluteCapacity * capacity; 
 
-    float maximumCapacity = 
+    float  maximumCapacity =
       (float) cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
     float absoluteMaxCapacity = 
-      (Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ? 
-          Float.MAX_VALUE :  (parentAbsoluteCapacity * maximumCapacity);
+          CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
     
     QueueState state = cs.getConfiguration().getState(getQueuePath());
 
@@ -497,12 +495,8 @@ public class ParentQueue implements CSQueue {
     CSQueueUtils.checkMaxCapacity(getQueueName(), capacity, maximumCapacity);
     
     this.maximumCapacity = maximumCapacity;
-    float parentAbsoluteCapacity = 
-        (rootQueue) ? 100.0f : parent.getAbsoluteCapacity();
     this.absoluteMaxCapacity = 
-      (maximumCapacity == CapacitySchedulerConfiguration.UNDEFINED) ? 
-          Float.MAX_VALUE : 
-          (parentAbsoluteCapacity * maximumCapacity);
+        CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
   }
 
   @Override
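
A quick arithmetic check of the MAPREDUCE-3683 semantics encoded by computeAbsoluteMaximumCapacity, using made-up queue settings rather than anything from this commit:

```java
public class AbsoluteMaxCapacityDemo {
  public static void main(String[] args) {
    float rootAbsMax = 1.0f;                     // root is always the whole cluster
    float parentMax  = 0.8f;                     // parent queue configured at 80%
    float leafMax    = 0.5f;                     // leaf queue configured at 50%

    float parentAbsMax = rootAbsMax * parentMax; // 0.8
    float leafAbsMax   = parentAbsMax * leafMax; // 0.4: product of parent maxima,
                                                 // no longer Float.MAX_VALUE when unset
    System.out.println(leafAbsMax);
  }
}
```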

+ 1 - 1
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java

@@ -358,7 +358,7 @@ public class FifoScheduler implements ResourceScheduler {
         }
       }
       
-      application.setAvailableResourceLimit(clusterResource);
+      application.setHeadroom(clusterResource);
       
       LOG.debug("post-assignContainers");
       application.showRequests();

+ 62 - 16
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java

@@ -21,16 +21,24 @@ import static org.junit.Assert.*;
 import static org.mockito.Mockito.*;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.Priority;
 import org.apache.hadoop.yarn.api.records.QueueACL;
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceRequest;
+import org.apache.hadoop.yarn.factories.RecordFactory;
+import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
+import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
+import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
 import org.junit.After;
@@ -283,38 +291,76 @@ public class TestApplicationLimits {
     final String user_0 = "user_0";
     final String user_1 = "user_1";
     
-    int APPLICATION_ID = 0;
+    RecordFactory recordFactory = 
+        RecordFactoryProvider.getRecordFactory(null);
+    RMContext rmContext = TestUtils.getMockRMContext();
+
+    Priority priority_1 = TestUtils.createMockPriority(1);
 
-    // Submit first application from user_0, check headroom
-    SchedulerApp app_0_0 = getMockApplication(APPLICATION_ID++, user_0);
+    // Submit first application with some resource-requests from user_0, 
+    // and check headroom
+    final ApplicationAttemptId appAttemptId_0_0 = 
+        TestUtils.getMockApplicationAttemptId(0, 0); 
+    SchedulerApp app_0_0 = 
+        spy(new SchedulerApp(appAttemptId_0_0, user_0, queue, rmContext, null));
     queue.submitApplication(app_0_0, user_0, A);
-    queue.assignContainers(clusterResource, node_0); // Schedule to compute
+
+    List<ResourceRequest> app_0_0_requests = new ArrayList<ResourceRequest>();
+    app_0_0_requests.add(
+        TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2, 
+            priority_1, recordFactory));
+    app_0_0.updateResourceRequests(app_0_0_requests);
+
+    // Schedule to compute 
+    queue.assignContainers(clusterResource, node_0);
     Resource expectedHeadroom = Resources.createResource(10*16*GB);
-    verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
+    verify(app_0_0).setHeadroom(eq(expectedHeadroom));
 
     // Submit second application from user_0, check headroom
-    SchedulerApp app_0_1 = getMockApplication(APPLICATION_ID++, user_0);
+    final ApplicationAttemptId appAttemptId_0_1 = 
+        TestUtils.getMockApplicationAttemptId(1, 0); 
+    SchedulerApp app_0_1 = 
+        spy(new SchedulerApp(appAttemptId_0_1, user_0, queue, rmContext, null));
     queue.submitApplication(app_0_1, user_0, A);
+    
+    List<ResourceRequest> app_0_1_requests = new ArrayList<ResourceRequest>();
+    app_0_1_requests.add(
+        TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2, 
+            priority_1, recordFactory));
+    app_0_1.updateResourceRequests(app_0_1_requests);
+
+    // Schedule to compute 
     queue.assignContainers(clusterResource, node_0); // Schedule to compute
-    verify(app_0_0, times(2)).setAvailableResourceLimit(eq(expectedHeadroom));
-    verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));// no change
+    verify(app_0_0, times(2)).setHeadroom(eq(expectedHeadroom));
+    verify(app_0_1).setHeadroom(eq(expectedHeadroom));// no change
    
     // Submit first application from user_1, check  for new headroom
-    SchedulerApp app_1_0 = getMockApplication(APPLICATION_ID++, user_1);
+    final ApplicationAttemptId appAttemptId_1_0 = 
+        TestUtils.getMockApplicationAttemptId(2, 0); 
+    SchedulerApp app_1_0 = 
+        spy(new SchedulerApp(appAttemptId_1_0, user_1, queue, rmContext, null));
     queue.submitApplication(app_1_0, user_1, A);
+
+    List<ResourceRequest> app_1_0_requests = new ArrayList<ResourceRequest>();
+    app_1_0_requests.add(
+        TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2, 
+            priority_1, recordFactory));
+    app_1_0.updateResourceRequests(app_1_0_requests);
+    
+    // Schedule to compute 
     queue.assignContainers(clusterResource, node_0); // Schedule to compute
     expectedHeadroom = Resources.createResource(10*16*GB / 2); // changes
-    verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
-    verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));
-    verify(app_1_0).setAvailableResourceLimit(eq(expectedHeadroom));
-    
+    verify(app_0_0).setHeadroom(eq(expectedHeadroom));
+    verify(app_0_1).setHeadroom(eq(expectedHeadroom));
+    verify(app_1_0).setHeadroom(eq(expectedHeadroom));
+
     // Now reduce cluster size and check for the smaller headroom
     clusterResource = Resources.createResource(90*16*GB);
     queue.assignContainers(clusterResource, node_0); // Schedule to compute
     expectedHeadroom = Resources.createResource(9*16*GB / 2); // changes
-    verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
-    verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));
-    verify(app_1_0).setAvailableResourceLimit(eq(expectedHeadroom));
+    verify(app_0_0).setHeadroom(eq(expectedHeadroom));
+    verify(app_0_1).setHeadroom(eq(expectedHeadroom));
+    verify(app_1_0).setHeadroom(eq(expectedHeadroom));
   }
  
 

+ 7 - 7
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java

@@ -255,7 +255,7 @@ public class TestLeafQueue {
     // Manipulate queue 'a'
     // Manipulate queue 'a'
     LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
     LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
     //unset maxCapacity
     //unset maxCapacity
-    a.setMaxCapacity(-0.01f);
+    a.setMaxCapacity(1.0f);
 
 
     // Users
     // Users
     final String user_0 = "user_0";
     final String user_0 = "user_0";
@@ -377,7 +377,7 @@ public class TestLeafQueue {
     // Mock the queue
     // Mock the queue
     LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
     LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
     //unset maxCapacity
     //unset maxCapacity
-    a.setMaxCapacity(-0.01f);
+    a.setMaxCapacity(1.0f);
     
     
     // Users
     // Users
     final String user_0 = "user_0";
     final String user_0 = "user_0";
@@ -491,7 +491,7 @@ public class TestLeafQueue {
     
     
     // Revert max-capacity and user-limit-factor
     // Now, allocations should goto app_3 since it's under user-limit 
-    a.setMaxCapacity(-0.01f);
+    a.setMaxCapacity(1.0f);
     a.setUserLimitFactor(1);
     a.assignContainers(clusterResource, node_0);
     assertEquals(7*GB, a.getUsedResources().getMemory()); 
@@ -548,7 +548,7 @@ public class TestLeafQueue {
     // Manipulate queue 'a'
     LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
     //unset maxCapacity
-    a.setMaxCapacity(-0.01f);
+    a.setMaxCapacity(1.0f);
 
     // Users
     final String user_0 = "user_0";
@@ -571,7 +571,7 @@ public class TestLeafQueue {
     String host_0 = "host_0";
     SchedulerNode node_0 = TestUtils.getMockNode(host_0, DEFAULT_RACK, 0, 4*GB);
     
-    final int numNodes = 1;
+    final int numNodes = 2;
     Resource clusterResource = Resources.createResource(numNodes * (4*GB));
     when(csContext.getNumClusterNodes()).thenReturn(numNodes);
     
@@ -646,7 +646,7 @@ public class TestLeafQueue {
     // Manipulate queue 'a'
     LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
     //unset maxCapacity
-    a.setMaxCapacity(-0.01f);
+    a.setMaxCapacity(1.0f);
     a.setUserLimitFactor(10);
 
     // Users
@@ -673,7 +673,7 @@ public class TestLeafQueue {
     String host_1 = "host_1";
     SchedulerNode node_1 = TestUtils.getMockNode(host_1, DEFAULT_RACK, 0, 4*GB);
     
-    final int numNodes = 2;
+    final int numNodes = 3;
     Resource clusterResource = Resources.createResource(numNodes * (4*GB));
     when(csContext.getNumClusterNodes()).thenReturn(numNodes);
     when(csContext.getMaximumResourceCapability()).thenReturn(

+ 31 - 1
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestQueueParsing.java

@@ -30,6 +30,8 @@ public class TestQueueParsing {
 
 
   private static final Log LOG = LogFactory.getLog(TestQueueParsing.class);
   
+  private static final double DELTA = 0.000001;
+  
   @Test
   public void testQueueParsing() throws Exception {
     CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();
@@ -37,6 +39,20 @@ public class TestQueueParsing {
 
 
     CapacityScheduler capacityScheduler = new CapacityScheduler();
     capacityScheduler.reinitialize(conf, null, null);
+    
+    CSQueue a = capacityScheduler.getQueue("a");
+    Assert.assertEquals(0.10, a.getAbsoluteCapacity(), DELTA);
+    Assert.assertEquals(0.15, a.getAbsoluteMaximumCapacity(), DELTA);
+    
+    CSQueue b1 = capacityScheduler.getQueue("b1");
+    Assert.assertEquals(0.2 * 0.5, b1.getAbsoluteCapacity(), DELTA);
+    Assert.assertEquals("Parent B has no MAX_CAP", 
+        0.85, b1.getAbsoluteMaximumCapacity(), DELTA);
+    
+    CSQueue c12 = capacityScheduler.getQueue("c12");
+    Assert.assertEquals(0.7 * 0.5 * 0.45, c12.getAbsoluteCapacity(), DELTA);
+    Assert.assertEquals(0.7 * 0.55 * 0.7, 
+        c12.getAbsoluteMaximumCapacity(), DELTA);
   }
   
   private void setupQueueConfiguration(CapacitySchedulerConfiguration conf) {
@@ -47,12 +63,14 @@ public class TestQueueParsing {
     
     
     final String A = CapacitySchedulerConfiguration.ROOT + ".a";
     conf.setCapacity(A, 10);
+    conf.setMaximumCapacity(A, 15);
     
     final String B = CapacitySchedulerConfiguration.ROOT + ".b";
     conf.setCapacity(B, 20);
-
+    
     final String C = CapacitySchedulerConfiguration.ROOT + ".c";
     conf.setCapacity(C, 70);
+    conf.setMaximumCapacity(C, 70);
 
     LOG.info("Setup top-level queues");
     
@@ -61,15 +79,20 @@ public class TestQueueParsing {
     final String A2 = A + ".a2";
     conf.setQueues(A, new String[] {"a1", "a2"});
     conf.setCapacity(A1, 30);
+    conf.setMaximumCapacity(A1, 45);
     conf.setCapacity(A2, 70);
+    conf.setMaximumCapacity(A2, 85);
     
     final String B1 = B + ".b1";
     final String B2 = B + ".b2";
     final String B3 = B + ".b3";
     conf.setQueues(B, new String[] {"b1", "b2", "b3"});
     conf.setCapacity(B1, 50);
+    conf.setMaximumCapacity(B1, 85);
     conf.setCapacity(B2, 30);
+    conf.setMaximumCapacity(B2, 35);
     conf.setCapacity(B3, 20);
+    conf.setMaximumCapacity(B3, 35);
 
     final String C1 = C + ".c1";
     final String C2 = C + ".c2";
@@ -77,9 +100,13 @@ public class TestQueueParsing {
     final String C4 = C + ".c4";
     conf.setQueues(C, new String[] {"c1", "c2", "c3", "c4"});
     conf.setCapacity(C1, 50);
+    conf.setMaximumCapacity(C1, 55);
     conf.setCapacity(C2, 10);
+    conf.setMaximumCapacity(C2, 25);
     conf.setCapacity(C3, 35);
+    conf.setMaximumCapacity(C3, 38);
     conf.setCapacity(C4, 5);
+    conf.setMaximumCapacity(C4, 5);
     
     LOG.info("Setup 2nd-level queues");
     
@@ -89,8 +116,11 @@ public class TestQueueParsing {
     final String C13 = C1 + ".c13";
     conf.setQueues(C1, new String[] {"c11", "c12", "c13"});
     conf.setCapacity(C11, 15);
+    conf.setMaximumCapacity(C11, 30);
     conf.setCapacity(C12, 45);
+    conf.setMaximumCapacity(C12, 70);
     conf.setCapacity(C13, 40);
+    conf.setMaximumCapacity(C13, 40);
     
     LOG.info("Setup 3rd-level queues");
   }

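The new assertions encode the capacity-hierarchy arithmetic: a queue's absolute capacity is the product of the relative capacities on its path from the root, and likewise for the absolute maximum capacity (an unset parent maximum appears to be treated as 100%, which is why b1 asserts 0.85 directly). A small sketch of that product, matching the c12 figures above; the helper name is made up for illustration.

// Illustration only: the product rule behind the absolute-capacity assertions.
public final class AbsoluteCapacity {
  private AbsoluteCapacity() {}

  // Multiply relative capacities (fractions of the parent) from root to leaf.
  static float product(float... relativeCapacities) {
    float result = 1.0f;
    for (float c : relativeCapacities) {
      result *= c;
    }
    return result;
  }

  public static void main(String[] args) {
    // root.c.c1.c12: capacities 70%, 50%, 45% -> absolute capacity 0.1575
    System.out.println(product(0.70f, 0.50f, 0.45f));
    // maximum capacities 70%, 55%, 70% -> absolute maximum capacity 0.2695
    System.out.println(product(0.70f, 0.55f, 0.70f));
  }
}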
+ 13 - 10
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesCapacitySched.java

@@ -235,12 +235,13 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
         Element qElem = (Element) queues.item(j);
         String qName = WebServicesTestUtils.getXmlString(qElem, "queueName");
         String q = CapacitySchedulerConfiguration.ROOT + "." + qName;
-        verifySubQueueXML(qElem, q, 100);
+        verifySubQueueXML(qElem, q, 100, 100);
       }
     }
   }
 
-  public void verifySubQueueXML(Element qElem, String q, float parentAbsCapacity)
+  public void verifySubQueueXML(Element qElem, String q, 
+      float parentAbsCapacity, float parentAbsMaxCapacity)
       throws Exception {
     NodeList queues = qElem.getElementsByTagName("subQueues");
     QueueInfo qi = (queues != null) ? new QueueInfo() : new LeafQueueInfo();
@@ -258,14 +259,15 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
         WebServicesTestUtils.getXmlString(qElem, "usedResources");
     qi.queueName = WebServicesTestUtils.getXmlString(qElem, "queueName");
     qi.state = WebServicesTestUtils.getXmlString(qElem, "state");
-    verifySubQueueGeneric(q, qi, parentAbsCapacity);
+    verifySubQueueGeneric(q, qi, parentAbsCapacity, parentAbsMaxCapacity);
 
     if (queues != null) {
       for (int j = 0; j < queues.getLength(); j++) {
         Element subqElem = (Element) queues.item(j);
         String qName = WebServicesTestUtils.getXmlString(subqElem, "queueName");
         String q2 = q + "." + qName;
-        verifySubQueueXML(subqElem, q2, qi.absoluteCapacity);
+        verifySubQueueXML(subqElem, q2, 
+            qi.absoluteCapacity, qi.absoluteMaxCapacity);
       }
     } else {
       LeafQueueInfo lqi = (LeafQueueInfo) qi;
@@ -309,7 +311,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
     for (int i = 0; i < arr.length(); i++) {
       JSONObject obj = arr.getJSONObject(i);
       String q = CapacitySchedulerConfiguration.ROOT + "." + obj.getString("queueName");
-      verifySubQueue(obj, q, 100);
+      verifySubQueue(obj, q, 100, 100);
     }
   }
 
@@ -323,7 +325,8 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
     assertTrue("queueName doesn't match", "root".matches(queueName));
   }
 
-  private void verifySubQueue(JSONObject info, String q, float parentAbsCapacity)
+  private void verifySubQueue(JSONObject info, String q, 
+      float parentAbsCapacity, float parentAbsMaxCapacity)
       throws JSONException, Exception {
     int numExpectedElements = 11;
     boolean isParentQueue = true;
@@ -345,7 +348,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
     qi.queueName = info.getString("queueName");
     qi.state = info.getString("state");
 
-    verifySubQueueGeneric(q, qi, parentAbsCapacity);
+    verifySubQueueGeneric(q, qi, parentAbsCapacity, parentAbsMaxCapacity);
 
     if (isParentQueue) {
       JSONArray arr = info.getJSONArray("subQueues");
@@ -353,7 +356,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
       for (int i = 0; i < arr.length(); i++) {
         JSONObject obj = arr.getJSONObject(i);
         String q2 = q + "." + obj.getString("queueName");
-        verifySubQueue(obj, q2, qi.absoluteCapacity);
+        verifySubQueue(obj, q2, qi.absoluteCapacity, qi.absoluteMaxCapacity);
       }
     } else {
       LeafQueueInfo lqi = (LeafQueueInfo) qi;
@@ -371,7 +374,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
   }
 
   private void verifySubQueueGeneric(String q, QueueInfo info,
-      float parentAbsCapacity) throws Exception {
+      float parentAbsCapacity, float parentAbsMaxCapacity) throws Exception {
     String[] qArr = q.split("\\.");
     assertTrue("q name invalid: " + q, qArr.length > 1);
     String qshortName = qArr[qArr.length - 1];
@@ -380,7 +383,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
     assertEquals("capacity doesn't match", csConf.getCapacity(q),
         info.capacity, 1e-3f);
     float expectCapacity = csConf.getMaximumCapacity(q);
-    float expectAbsMaxCapacity = parentAbsCapacity * (info.maxCapacity/100);
+    float expectAbsMaxCapacity = parentAbsMaxCapacity * (info.maxCapacity/100);
     if (CapacitySchedulerConfiguration.UNDEFINED == expectCapacity) {
       expectCapacity = 100;
       expectAbsMaxCapacity = 100;

+ 18 - 0
hadoop-project/pom.xml

@@ -714,11 +714,21 @@
           <artifactId>maven-project-info-reports-plugin</artifactId>
           <version>2.4</version>
         </plugin>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-resources-plugin</artifactId>
+          <version>2.2</version>
+        </plugin>
         <plugin>
           <groupId>org.codehaus.mojo</groupId>
           <artifactId>exec-maven-plugin</artifactId>
           <version>1.2</version>
         </plugin>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-pdf-plugin</artifactId>
+          <version>1.1</version>
+        </plugin>
       </plugins>
     </pluginManagement>
 
@@ -778,6 +788,14 @@
           </excludes>
         </configuration>
       </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-pdf-plugin</artifactId>
+        <configuration>
+          <outputDirectory>${project.reporting.outputDirectory}</outputDirectory>
+          <includeReports>false</includeReports>
+        </configuration>
+      </plugin>
     </plugins>
   </build>
 

+ 7 - 0
hadoop-tools/hadoop-distcp/README

@@ -0,0 +1,7 @@
+DistCp (distributed copy) is a tool used for large inter/intra-cluster copying. 
+It uses Map/Reduce to effect its distribution, error handling and recovery, 
+and reporting. It expands a list of files and directories into input to map tasks, 
+each of which will copy a partition of the files specified in the source list.
+
+Version 0.1 (2010/08/02 sriksun)
+ - Initial Version

+ 198 - 0
hadoop-tools/hadoop-distcp/pom.xml

@@ -0,0 +1,198 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+<project>
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.hadoop</groupId>
+    <artifactId>hadoop-project</artifactId>
+    <version>0.24.0-SNAPSHOT</version>
+    <relativePath>../../hadoop-project</relativePath>
+  </parent>
+  <groupId>org.apache.hadoop.tools</groupId>
+  <artifactId>hadoop-distcp</artifactId>
+  <version>0.24.0-SNAPSHOT</version>
+  <description>Apache Hadoop Distributed Copy</description>
+  <name>Apache Hadoop Distributed Copy</name>
+  <packaging>jar</packaging>
+
+  <properties>
+    <file.encoding>UTF-8</file.encoding>
+    <downloadSources>true</downloadSources>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-annotations</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-app</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-hs</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-core</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <resources>
+      <resource>
+        <directory>src/main/resources</directory>
+        <filtering>true</filtering>
+      </resource>
+    </resources>
+    <testResources>
+      <testResource>
+        <directory>src/test/resources</directory>
+        <filtering>true</filtering>
+      </testResource>
+    </testResources>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-surefire-plugin</artifactId>
+        <configuration>
+          <forkMode>always</forkMode>
+          <forkedProcessTimeoutInSeconds>600</forkedProcessTimeoutInSeconds>
+          <argLine>-Xmx1024m</argLine>
+          <includes>
+            <include>**/Test*.java</include>
+          </includes>
+          <redirectTestOutputToFile>true</redirectTestOutputToFile>
+          <systemProperties>
+            <property>
+              <name>test.build.data</name>
+              <value>${basedir}/target/test/data</value>
+            </property>
+            <property>
+              <name>hadoop.log.dir</name>
+              <value>target/test/logs</value>
+            </property>
+            <property>
+              <name>org.apache.commons.logging.Log</name>
+              <value>org.apache.commons.logging.impl.SimpleLog</value>
+            </property>
+            <property>
+              <name>org.apache.commons.logging.simplelog.defaultlog</name>
+              <value>warn</value>
+            </property>
+          </systemProperties>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-dependency-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>copy-dependencies</goal>
+            </goals>
+            <configuration>
+              <outputDirectory>${project.build.directory}/lib</outputDirectory>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-checkstyle-plugin</artifactId>
+        <configuration>
+          <enableRulesSummary>true</enableRulesSummary>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifest>
+              <mainClass>org.apache.hadoop.tools.DistCp</mainClass>
+            </manifest>
+          </archive>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-source-plugin</artifactId>
+        <configuration>
+          <attach>true</attach>
+        </configuration>
+        <executions>
+          <execution>
+            <goals>
+              <goal>jar</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-pdf-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>pdf</id>
+            <phase>package</phase>
+            <goals>
+              <goal>pdf</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>

+ 218 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyListing.java

@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.security.Credentials;
+
+import java.io.IOException;
+
+/**
+ * The CopyListing abstraction is responsible for how the list of
+ * sources and targets is constructed, for DistCp's copy function.
+ * The copy-listing should be a SequenceFile<Text, FileStatus>,
+ * located at the path specified to buildListing(),
+ * each entry being a pair of (Source relative path, source file status),
+ * all the paths being fully qualified.
+ */
+public abstract class CopyListing extends Configured {
+
+  private Credentials credentials;
+
+  /**
+   * Build listing function creates the input listing that distcp uses to
+   * perform the copy.
+   *
+   * The build listing is a sequence file that has relative path of a file in the key
+   * and the file status information of the source file in the value
+   *
+   * For instance if the source path is /tmp/data and the traversed path is
+   * /tmp/data/dir1/dir2/file1, then the sequence file would contain
+   *
+   * key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1)
+   *
+   * File would also contain directory entries. Meaning, if /tmp/data/dir1/dir2/file1
+   * is the only file under /tmp/data, the resulting sequence file would contain the
+   * following entries
+   *
+   * key: /dir1 and value: FileStatus(/tmp/data/dir1)
+   * key: /dir1/dir2 and value: FileStatus(/tmp/data/dir1/dir2)
+   * key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1)
+   *
+   * Cases requiring special handling:
+   * If source path is a file (/tmp/file1), contents of the file will be as follows
+   *
+   * TARGET DOES NOT EXIST: Key-"", Value-FileStatus(/tmp/file1)
+   * TARGET IS FILE       : Key-"", Value-FileStatus(/tmp/file1)
+   * TARGET IS DIR        : Key-"/file1", Value-FileStatus(/tmp/file1)  
+   *
+   * @param pathToListFile - Output file where the listing would be stored
+   * @param options - Input options to distcp
+   * @throws IOException - Exception if any
+   */
+  public final void buildListing(Path pathToListFile,
+                                 DistCpOptions options) throws IOException {
+    validatePaths(options);
+    doBuildListing(pathToListFile, options);
+    Configuration config = getConf();
+
+    config.set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, pathToListFile.toString());
+    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, getBytesToCopy());
+    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, getNumberOfPaths());
+
+    checkForDuplicates(pathToListFile);
+  }
+
+  /**
+   * Validate input and output paths
+   *
+   * @param options - Input options
+   * @throws InvalidInputException: If inputs are invalid
+   * @throws IOException: any Exception with FS 
+   */
+  protected abstract void validatePaths(DistCpOptions options)
+      throws IOException, InvalidInputException;
+
+  /**
+   * The interface to be implemented by sub-classes, to create the source/target file listing.
+   * @param pathToListFile Path on HDFS where the listing file is written.
+   * @param options Input Options for DistCp (indicating source/target paths.)
+   * @throws IOException: Thrown on failure to create the listing file.
+   */
+  protected abstract void doBuildListing(Path pathToListFile,
+                                         DistCpOptions options) throws IOException;
+
+  /**
+   * Return the total bytes that distCp should copy for the source paths
+   * This doesn't account for files that may be skipped during the copy because they are identical at the target
+   *
+   * @return total bytes to copy
+   */
+  protected abstract long getBytesToCopy();
+
+  /**
+   * Return the total number of paths to distcp, includes directories as well
+   * This doesn't consider whether file/dir is already present and should be skipped during copy
+   *
+   * @return Total number of paths to distcp
+   */
+  protected abstract long getNumberOfPaths();
+
+  /**
+   * Validate the final resulting path listing to see if there are any duplicate entries
+   *
+   * @param pathToListFile - path listing build by doBuildListing
+   * @throws IOException - Any issues while checking for duplicates and throws
+   * @throws DuplicateFileException - if there are duplicates
+   */
+  private void checkForDuplicates(Path pathToListFile)
+      throws DuplicateFileException, IOException {
+
+    Configuration config = getConf();
+    FileSystem fs = pathToListFile.getFileSystem(config);
+
+    Path sortedList = DistCpUtils.sortListing(fs, config, pathToListFile);
+
+    SequenceFile.Reader reader = new SequenceFile.Reader(
+                          config, SequenceFile.Reader.file(sortedList));
+    try {
+      Text lastKey = new Text("*"); //source relative path can never hold *
+      FileStatus lastFileStatus = new FileStatus();
+
+      Text currentKey = new Text();
+      while (reader.next(currentKey)) {
+        if (currentKey.equals(lastKey)) {
+          FileStatus currentFileStatus = new FileStatus();
+          reader.getCurrentValue(currentFileStatus);
+          throw new DuplicateFileException("File " + lastFileStatus.getPath() + " and " +
+              currentFileStatus.getPath() + " would cause duplicates. Aborting");
+        }
+        reader.getCurrentValue(lastFileStatus);
+        lastKey.set(currentKey);
+      }
+    } finally {
+      IOUtils.closeStream(reader);
+    }
+  }
+
+  /**
+   * Protected constructor, to initialize configuration.
+   * @param configuration The input configuration,
+   *                        with which the source/target FileSystems may be accessed.
+   * @param credentials - Credentials object on which the FS delegation tokens are cached. If null,
+   * delegation token caching is skipped.
+   */
+  protected CopyListing(Configuration configuration, Credentials credentials) {
+    setConf(configuration);
+    setCredentials(credentials);
+  }
+
+  /**
+   * Set the Credentials store, on which FS delegation tokens will be cached
+   * @param credentials - Credentials object
+   */
+  protected void setCredentials(Credentials credentials) {
+    this.credentials = credentials;
+  }
+
+  /**
+   * get credentials to update the delegation tokens for accessed FS objects
+   * @return Credentials object
+   */
+  protected Credentials getCredentials() {
+    return credentials;
+  }
+
+  /**
+   * Public Factory method with which the appropriate CopyListing implementation may be retrieved.
+   * @param configuration The input configuration.
+   * @param credentials Credentials object on which the FS delegation tokens are cached
+   * @param options The input Options, to help choose the appropriate CopyListing Implementation.
+   * @return An instance of the appropriate CopyListing implementation.
+   */
+  public static CopyListing getCopyListing(Configuration configuration,
+                                           Credentials credentials,
+                                           DistCpOptions options) {
+    if (options.getSourceFileListing() == null) {
+      return new GlobbedCopyListing(configuration, credentials);
+    } else {
+      return new FileBasedCopyListing(configuration, credentials);
+    }
+  }
+
+  static class DuplicateFileException extends RuntimeException {
+    public DuplicateFileException(String message) {
+      super(message);
+    }
+  }
+
+  static class InvalidInputException extends RuntimeException {
+    public InvalidInputException(String message) {
+      super(message);
+    }
+  }
+}
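
For context, a minimal (hypothetical) subclass sketch of the contract defined above: doBuildListing() writes a SequenceFile of (relative path, FileStatus) pairs, and the byte/path counters report what was written. This flat, non-recursive listing is illustration only; the shipped implementations are GlobbedCopyListing and FileBasedCopyListing, and the sketch assumes DistCpOptions exposes getSourcePaths().

package org.apache.hadoop.tools;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.Credentials;

import java.io.IOException;

// Illustration only: a flat listing of the top-level source paths themselves.
class FlatCopyListing extends CopyListing {
  private long bytesToCopy;
  private long numberOfPaths;

  protected FlatCopyListing(Configuration configuration, Credentials credentials) {
    super(configuration, credentials);
  }

  @Override
  protected void validatePaths(DistCpOptions options) throws IOException {
    // Assumes DistCpOptions.getSourcePaths() returns the -source path list.
    if (options.getSourcePaths() == null || options.getSourcePaths().isEmpty()) {
      throw new InvalidInputException("No source paths specified");
    }
  }

  @Override
  protected void doBuildListing(Path pathToListFile, DistCpOptions options)
      throws IOException {
    SequenceFile.Writer writer = SequenceFile.createWriter(getConf(),
        SequenceFile.Writer.file(pathToListFile),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(FileStatus.class));
    try {
      for (Path source : options.getSourcePaths()) {
        FileStatus status = source.getFileSystem(getConf()).getFileStatus(source);
        // Simplified: use "/<name>" as the relative-path key for each top-level source.
        writer.append(new Text("/" + source.getName()), status);
        bytesToCopy += status.getLen();
        numberOfPaths++;
      }
    } finally {
      IOUtils.closeStream(writer);
    }
  }

  @Override
  protected long getBytesToCopy() { return bytesToCopy; }

  @Override
  protected long getNumberOfPaths() { return numberOfPaths; }
}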

+ 405 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCp.java

@@ -0,0 +1,405 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobSubmissionFiles;
+import org.apache.hadoop.mapreduce.Cluster;
+import org.apache.hadoop.tools.CopyListing.*;
+import org.apache.hadoop.tools.mapred.CopyMapper;
+import org.apache.hadoop.tools.mapred.CopyOutputFormat;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import java.io.IOException;
+import java.util.Random;
+
+/**
+ * DistCp is the main driver-class for DistCpV2.
+ * For command-line use, DistCp::main() orchestrates the parsing of command-line
+ * parameters and the launch of the DistCp job.
+ * For programmatic use, a DistCp object can be constructed by specifying
+ * options (in a DistCpOptions object), and DistCp::execute() may be used to
+ * launch the copy-job. DistCp may alternatively be sub-classed to fine-tune
+ * behaviour.
+ */
+public class DistCp extends Configured implements Tool {
+  private static final Log LOG = LogFactory.getLog(DistCp.class);
+
+  private DistCpOptions inputOptions;
+  private Path metaFolder;
+
+  private static final String PREFIX = "_distcp";
+  private static final String WIP_PREFIX = "._WIP_";
+  private static final String DISTCP_DEFAULT_XML = "distcp-default.xml";
+  public static final Random rand = new Random();
+
+  private boolean submitted;
+  private FileSystem jobFS;
+
+  /**
+   * Public Constructor. Creates DistCp object with specified input-parameters.
+   * (E.g. source-paths, target-location, etc.)
+   * @param inputOptions Options (indicating source-paths, target-location.)
+   * @param configuration The Hadoop configuration against which the Copy-mapper must run.
+   * @throws Exception, on failure.
+   */
+  public DistCp(Configuration configuration, DistCpOptions inputOptions) throws Exception {
+    Configuration config = new Configuration(configuration);
+    config.addResource(DISTCP_DEFAULT_XML);
+    setConf(config);
+    this.inputOptions = inputOptions;
+    this.metaFolder   = createMetaFolderPath();
+  }
+
+  /**
+   * To be used with the ToolRunner. Not for public consumption.
+   */
+  private DistCp() {}
+
+  /**
+   * Implementation of Tool::run(). Orchestrates the copy of source file(s)
+   * to target location, by:
+   *  1. Creating a list of files to be copied to target.
+   *  2. Launching a Map-only job to copy the files. (Delegates to execute().)
+   * @param argv List of arguments passed to DistCp, from the ToolRunner.
+   * @return On success, returns DistCpConstants.SUCCESS (0). Otherwise, a negative error code from DistCpConstants.
+   */
+  public int run(String[] argv) {
+    try {
+      inputOptions = (OptionsParser.parse(argv));
+
+      LOG.info("Input Options: " + inputOptions);
+    } catch (Throwable e) {
+      LOG.error("Invalid arguments: ", e);
+      System.err.println("Invalid arguments: " + e.getMessage());
+      OptionsParser.usage();      
+      return DistCpConstants.INVALID_ARGUMENT;
+    }
+    
+    try {
+      execute();
+    } catch (InvalidInputException e) {
+      LOG.error("Invalid input: ", e);
+      return DistCpConstants.INVALID_ARGUMENT;
+    } catch (DuplicateFileException e) {
+      LOG.error("Duplicate files in input path: ", e);
+      return DistCpConstants.DUPLICATE_INPUT;
+    } catch (Exception e) {
+      LOG.error("Exception encountered ", e);
+      return DistCpConstants.UNKNOWN_ERROR;
+    }
+    return DistCpConstants.SUCCESS;
+  }
+
+  /**
+   * Implements the core-execution. Creates the file-list for copy,
+   * and launches the Hadoop-job, to do the copy.
+   * @return Job handle
+   * @throws Exception, on failure.
+   */
+  public Job execute() throws Exception {
+    assert inputOptions != null;
+    assert getConf() != null;
+
+    Job job = null;
+    try {
+      metaFolder = createMetaFolderPath();
+      jobFS = metaFolder.getFileSystem(getConf());
+
+      job = createJob();
+      createInputFileListing(job);
+
+      job.submit();
+      submitted = true;
+    } finally {
+      if (!submitted) {
+        cleanup();
+      }
+    }
+
+    String jobID = job.getJobID().toString();
+    job.getConfiguration().set(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, jobID);
+    
+    LOG.info("DistCp job-id: " + jobID);
+    if (inputOptions.shouldBlock()) {
+      job.waitForCompletion(true);
+    }
+    return job;
+  }
+
+  /**
+   * Create Job object for submitting it, with all the configuration
+   *
+   * @return Reference to job object.
+   * @throws IOException - Exception if any
+   */
+  private Job createJob() throws IOException {
+    String jobName = "distcp";
+    String userChosenName = getConf().get(JobContext.JOB_NAME);
+    if (userChosenName != null)
+      jobName += ": " + userChosenName;
+    Job job = Job.getInstance(getConf());
+    job.setJobName(jobName);
+    job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), inputOptions));
+    job.setJarByClass(CopyMapper.class);
+    configureOutputFormat(job);
+
+    job.setMapperClass(CopyMapper.class);
+    job.setNumReduceTasks(0);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(Text.class);
+    job.setOutputFormatClass(CopyOutputFormat.class);
+    job.getConfiguration().set(JobContext.MAP_SPECULATIVE, "false");
+    job.getConfiguration().set(JobContext.NUM_MAPS,
+                  String.valueOf(inputOptions.getMaxMaps()));
+
+    if (inputOptions.getSslConfigurationFile() != null) {
+      setupSSLConfig(job);
+    }
+
+    inputOptions.appendToConf(job.getConfiguration());
+    return job;
+  }
+
+  /**
+   * Setup ssl configuration on the job configuration to enable hsftp access
+   * from map job. Also copy the ssl configuration file to Distributed cache
+   *
+   * @param job - Reference to job's handle
+   * @throws java.io.IOException - Exception if unable to locate ssl config file
+   */
+  private void setupSSLConfig(Job job) throws IOException  {
+    Configuration configuration = job.getConfiguration();
+    Path sslConfigPath = new Path(configuration.
+        getResource(inputOptions.getSslConfigurationFile()).toString());
+
+    addSSLFilesToDistCache(job, sslConfigPath);
+    configuration.set(DistCpConstants.CONF_LABEL_SSL_CONF, sslConfigPath.getName());
+    configuration.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfigPath.getName());
+  }
+
+  /**
+   * Add SSL files to distributed cache. Trust store, key store and ssl config xml
+   *
+   * @param job - Job handle
+   * @param sslConfigPath - ssl Configuration file specified through options
+   * @throws IOException - If any
+   */
+  private void addSSLFilesToDistCache(Job job,
+                                      Path sslConfigPath) throws IOException {
+    Configuration configuration = job.getConfiguration();
+    FileSystem localFS = FileSystem.getLocal(configuration);
+
+    Configuration sslConf = new Configuration(false);
+    sslConf.addResource(sslConfigPath);
+
+    Path localStorePath = getLocalStorePath(sslConf,
+                            DistCpConstants.CONF_LABEL_SSL_TRUST_STORE_LOCATION);
+    job.addCacheFile(localStorePath.makeQualified(localFS.getUri(),
+                                      localFS.getWorkingDirectory()).toUri());
+    configuration.set(DistCpConstants.CONF_LABEL_SSL_TRUST_STORE_LOCATION,
+                      localStorePath.getName());
+
+    localStorePath = getLocalStorePath(sslConf,
+                             DistCpConstants.CONF_LABEL_SSL_KEY_STORE_LOCATION);
+    job.addCacheFile(localStorePath.makeQualified(localFS.getUri(),
+                                      localFS.getWorkingDirectory()).toUri());
+    configuration.set(DistCpConstants.CONF_LABEL_SSL_KEY_STORE_LOCATION,
+                                      localStorePath.getName());
+
+    job.addCacheFile(sslConfigPath.makeQualified(localFS.getUri(),
+                                      localFS.getWorkingDirectory()).toUri());
+
+  }
+
+  /**
+   * Get Local Trust store/key store path
+   *
+   * @param sslConf - Config from SSL Client xml
+   * @param storeKey - Key for either trust store or key store
+   * @return - Path where the store is present
+   * @throws IOException -If any
+   */
+  private Path getLocalStorePath(Configuration sslConf, String storeKey) throws IOException {
+    if (sslConf.get(storeKey) != null) {
+      return new Path(sslConf.get(storeKey));
+    } else {
+      throw new IOException("Store for " + storeKey + " is not set in " +
+          inputOptions.getSslConfigurationFile());
+    }
+  }
+
+  /**
+   * Setup output format appropriately
+   *
+   * @param job - Job handle
+   * @throws IOException - Exception if any
+   */
+  private void configureOutputFormat(Job job) throws IOException {
+    final Configuration configuration = job.getConfiguration();
+    Path targetPath = inputOptions.getTargetPath();
+    FileSystem targetFS = targetPath.getFileSystem(configuration);
+    targetPath = targetPath.makeQualified(targetFS.getUri(),
+                                          targetFS.getWorkingDirectory());
+
+    if (inputOptions.shouldAtomicCommit()) {
+      Path workDir = inputOptions.getAtomicWorkPath();
+      if (workDir == null) {
+        workDir = targetPath.getParent();
+      }
+      workDir = new Path(workDir, WIP_PREFIX + targetPath.getName()
+                                + rand.nextInt());
+      FileSystem workFS = workDir.getFileSystem(configuration);
+      if (!DistCpUtils.compareFs(targetFS, workFS)) {
+        throw new IllegalArgumentException("Work path " + workDir +
+            " and target path " + targetPath + " are in different file system");
+      }
+      CopyOutputFormat.setWorkingDirectory(job, workDir);
+    } else {
+      CopyOutputFormat.setWorkingDirectory(job, targetPath);
+    }
+    CopyOutputFormat.setCommitDirectory(job, targetPath);
+
+    Path logPath = inputOptions.getLogPath();
+    if (logPath == null) {
+      logPath = new Path(metaFolder, "_logs");
+    } else {
+      LOG.info("DistCp job log path: " + logPath);
+    }
+    CopyOutputFormat.setOutputPath(job, logPath);
+  }
+
+  /**
+   * Create input listing by invoking an appropriate copy listing
+   * implementation. Also add delegation tokens for each path
+   * to job's credential store
+   *
+   * @param job - Handle to job
+   * @return Returns the path where the copy listing is created
+   * @throws IOException - If any
+   */
+  private Path createInputFileListing(Job job) throws IOException {
+    Path fileListingPath = getFileListingPath();
+    CopyListing copyListing = CopyListing.getCopyListing(job.getConfiguration(),
+        job.getCredentials(), inputOptions);
+    copyListing.buildListing(fileListingPath, inputOptions);
+    return fileListingPath;
+  }
+
+  /**
+   * Get default name of the copy listing file. Use the meta folder
+   * to create the copy listing file
+   *
+   * @return - Path where the copy listing file has to be saved
+   * @throws IOException - Exception if any
+   */
+  private Path getFileListingPath() throws IOException {
+    String fileListPathStr = metaFolder + "/fileList.seq";
+    Path path = new Path(fileListPathStr);
+    return new Path(path.toUri().normalize().toString());
+  }
+
+  /**
+   * Create a default working folder for the job, under the
+   * job staging directory
+   *
+   * @return Returns the working folder information
+   * @throws Exception - Exception if any
+   */
+  private Path createMetaFolderPath() throws Exception {
+    Configuration configuration = getConf();
+    Path stagingDir = JobSubmissionFiles.getStagingDir(
+            new Cluster(configuration), configuration);
+    Path metaFolderPath = new Path(stagingDir, PREFIX + String.valueOf(rand.nextInt()));
+    if (LOG.isDebugEnabled())
+      LOG.debug("Meta folder location: " + metaFolderPath);
+    configuration.set(DistCpConstants.CONF_LABEL_META_FOLDER, metaFolderPath.toString());    
+    return metaFolderPath;
+  }
+
+  /**
+   * Main function of the DistCp program. Parses the input arguments (via OptionsParser),
+   * and invokes the DistCp::run() method, via the ToolRunner.
+   * @param argv Command-line arguments sent to DistCp.
+   */
+  public static void main(String argv[]) {
+    try {
+      DistCp distCp = new DistCp();
+      Cleanup CLEANUP = new Cleanup(distCp);
+
+      Runtime.getRuntime().addShutdownHook(CLEANUP);
+      System.exit(ToolRunner.run(getDefaultConf(), distCp, argv));
+    }
+    catch (Exception e) {
+      LOG.error("Couldn't complete DistCp operation: ", e);
+      System.exit(DistCpConstants.UNKNOWN_ERROR);
+    }
+  }
+
+  /**
+   * Loads properties from distcp-default.xml into configuration
+   * object
+   * @return Configuration which includes properties from distcp-default.xml
+   */
+  private static Configuration getDefaultConf() {
+    Configuration config = new Configuration();
+    config.addResource(DISTCP_DEFAULT_XML);
+    return config;
+  }
+
+  private synchronized void cleanup() {
+    try {
+      if (metaFolder == null) return;
+
+      jobFS.delete(metaFolder, true);
+      metaFolder = null;
+    } catch (IOException e) {
+      LOG.error("Unable to cleanup meta folder: " + metaFolder, e);
+    }
+  }
+
+  private boolean isSubmitted() {
+    return submitted;
+  }
+
+  private static class Cleanup extends Thread {
+    private final DistCp distCp;
+
+    public Cleanup(DistCp distCp) {
+      this.distCp = distCp;
+    }
+
+    @Override
+    public void run() {
+      if (distCp.isSubmitted()) return;
+
+      distCp.cleanup();
+    }
+  }
+}
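
As the class javadoc notes, DistCp can also be driven programmatically: build a DistCpOptions, construct DistCp with it, and call execute(). A hedged sketch follows; the (List<Path>, Path) constructor of DistCpOptions is assumed (its constructors are not shown in this change excerpt), and the host names and paths are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;

import java.util.Collections;

public class DistCpDriver {
  public static void main(String[] args) throws Exception {
    Path source = new Path("hdfs://nn1:8020/data/input");    // placeholder cluster/paths
    Path target = new Path("hdfs://nn2:8020/data/backup");

    // Assumed constructor: DistCpOptions(List<Path> sourcePaths, Path targetPath)
    DistCpOptions options =
        new DistCpOptions(Collections.singletonList(source), target);

    DistCp distCp = new DistCp(new Configuration(), options);
    Job job = distCp.execute();   // waits for completion unless -async was requested
    System.out.println("DistCp job succeeded: " + job.isSuccessful());
  }
}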

+ 104 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java

@@ -0,0 +1,104 @@
+package org.apache.hadoop.tools;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Utility class to hold commonly used constants.
+ */
+public class DistCpConstants {
+
+  /* Default number of maps to use for DistCp */
+  public static final int DEFAULT_MAPS = 20;
+
+  /* Default bandwidth if none specified */
+  public static final int DEFAULT_BANDWIDTH_MB = 100;
+
+  /* Default strategy for copying. Implementation looked up
+     from distcp-default.xml
+   */
+  public static final String UNIFORMSIZE = "uniformsize";
+
+  /**
+   *  Constants mapping to command line switches/input options
+   */
+  public static final String CONF_LABEL_ATOMIC_COPY = "distcp.atomic.copy";
+  public static final String CONF_LABEL_WORK_PATH = "distcp.work.path";
+  public static final String CONF_LABEL_LOG_PATH = "distcp.log.path";
+  public static final String CONF_LABEL_IGNORE_FAILURES = "distcp.ignore.failures";
+  public static final String CONF_LABEL_PRESERVE_STATUS = "distcp.preserve.status";
+  public static final String CONF_LABEL_SYNC_FOLDERS = "distcp.sync.folders";
+  public static final String CONF_LABEL_DELETE_MISSING = "distcp.delete.missing.source";
+  public static final String CONF_LABEL_SSL_CONF = "distcp.keystore.resource";
+  public static final String CONF_LABEL_MAX_MAPS = "distcp.max.maps";
+  public static final String CONF_LABEL_SOURCE_LISTING = "distcp.source.listing";
+  public static final String CONF_LABEL_COPY_STRATEGY = "distcp.copy.strategy";
+  public static final String CONF_LABEL_SKIP_CRC = "distcp.skip.crc";
+  public static final String CONF_LABEL_OVERWRITE = "distcp.copy.overwrite";
+  public static final String CONF_LABEL_BANDWIDTH_MB = "distcp.map.bandwidth.mb";
+
+  /* Total bytes to be copied. Updated by copylisting. Unfiltered count */
+  public static final String CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED = "mapred.total.bytes.expected";
+
+  /* Total number of paths to copy, includes directories. Unfiltered count */
+  public static final String CONF_LABEL_TOTAL_NUMBER_OF_RECORDS = "mapred.number.of.records";
+
+  /* SSL keystore resource */
+  public static final String CONF_LABEL_SSL_KEYSTORE = "dfs.https.client.keystore.resource";
+
+  /* If input is based -f <<source listing>>, file containing the src paths */
+  public static final String CONF_LABEL_LISTING_FILE_PATH = "distcp.listing.file.path";
+
+  /* Directory where the mapreduce job will write to. If not atomic commit, then same
+    as CONF_LABEL_TARGET_FINAL_PATH
+   */
+  public static final String CONF_LABEL_TARGET_WORK_PATH = "distcp.target.work.path";
+
+  /* Directory where the final data will be committed to. If not atomic commit, then same
+    as CONF_LABEL_TARGET_WORK_PATH
+   */
+  public static final String CONF_LABEL_TARGET_FINAL_PATH = "distcp.target.final.path";
+
+  /**
+   * DistCp job id for consumers of DistCp
+   */
+  public static final String CONF_LABEL_DISTCP_JOB_ID = "distcp.job.id";
+
+  /* Meta folder where the job's intermediate data is kept */
+  public static final String CONF_LABEL_META_FOLDER = "distcp.meta.folder";
+
+  /**
+   * Conf label for SSL Trust-store location.
+   */
+  public static final String CONF_LABEL_SSL_TRUST_STORE_LOCATION
+      = "ssl.client.truststore.location";
+
+  /**
+   * Conf label for SSL Key-store location.
+   */
+  public static final String CONF_LABEL_SSL_KEY_STORE_LOCATION
+      = "ssl.client.keystore.location";
+
+  /**
+   * Constants for DistCp return code to shell / consumer of ToolRunner's run
+   */
+  public static final int SUCCESS = 0;
+  public static final int INVALID_ARGUMENT = -1;
+  public static final int DUPLICATE_INPUT = -2;
+  public static final int UNKNOWN_ERROR = -999;
+}
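
The return codes above are the contract between DistCp.run() (shown earlier in this change) and whatever launched it through ToolRunner or the shell. A small illustrative mapping of those codes to messages; the class and method names are made up.

import org.apache.hadoop.tools.DistCpConstants;

// Illustration only: interpreting the exit codes that DistCp.run() returns.
public class DistCpExitCodes {
  static String describe(int exitCode) {
    switch (exitCode) {
      case DistCpConstants.SUCCESS:          return "copy completed";
      case DistCpConstants.INVALID_ARGUMENT: return "bad options or invalid input paths";
      case DistCpConstants.DUPLICATE_INPUT:  return "duplicate files in the copy listing";
      case DistCpConstants.UNKNOWN_ERROR:    return "unexpected failure; check the job logs";
      default:                               return "unrecognized exit code " + exitCode;
    }
  }

  public static void main(String[] args) {
    System.out.println(describe(DistCpConstants.DUPLICATE_INPUT));
  }
}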

+ 218 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java

@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.cli.Option;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Enumeration mapping configuration keys to distcp command line
+ * options.
+ */
+public enum DistCpOptionSwitch {
+
+  /**
+   * Ignores any failures during copy, and continues with rest.
+   * Logs failures in a file
+   */
+  IGNORE_FAILURES(DistCpConstants.CONF_LABEL_IGNORE_FAILURES,
+      new Option("i", false, "Ignore failures during copy")),
+
+  /**
+   * Preserves status of file/path in the target.
+   * Default behavior with -p, is to preserve replication,
+   * block size, user, group and permission on the target file
+   *
+   * If any of the optional switches are present among rbugp, then
+   * only the corresponding file attribute is preserved
+   *
+   */
+  PRESERVE_STATUS(DistCpConstants.CONF_LABEL_PRESERVE_STATUS,
+      new Option("p", true, "preserve status (rbugp)" +
+          "(replication, block-size, user, group, permission)")),
+
+  /**
+   * Update target location by copying only files that are missing
+   * in the target. This can be used to periodically sync two folders
+   * across source and target. Typically used with DELETE_MISSING
+   * Incompatible with ATOMIC_COMMIT
+   */
+  SYNC_FOLDERS(DistCpConstants.CONF_LABEL_SYNC_FOLDERS, 
+      new Option("update", false, "Update target, copying only missing" +
+          "files or directories")),
+
+  /**
+   * Deletes missing files in target that are missing from source
+   * This allows the target to be in sync with the source contents
+   * Typically used in conjunction with SYNC_FOLDERS
+   * Incompatible with ATOMIC_COMMIT
+   */
+  DELETE_MISSING(DistCpConstants.CONF_LABEL_DELETE_MISSING,
+      new Option("delete", false, "Delete from target, " +
+          "files missing in source")),
+
+  /**
+   * Configuration file to use with hftps:// for securely copying
+   * files across clusters. Typically the configuration file contains
+   * truststore/keystore information such as location, password and type
+   */
+  SSL_CONF(DistCpConstants.CONF_LABEL_SSL_CONF,
+      new Option("mapredSslConf", true, "Configuration for ssl config file" +
+          ", to use with hftps://")),
+
+  /**
+   * Max number of maps to use during copy. DistCp will split work
+   * as equally as possible among these maps
+   */
+  MAX_MAPS(DistCpConstants.CONF_LABEL_MAX_MAPS, 
+      new Option("m", true, "Max number of concurrent maps to use for copy")),
+
+  /**
+   * Source file listing can be provided to DistCp in a file.
+   * This allows DistCp to copy random list of files from source
+   * and copy them to target
+   */
+  SOURCE_FILE_LISTING(DistCpConstants.CONF_LABEL_SOURCE_LISTING,
+      new Option("f", true, "List of files that need to be copied")),
+
+  /**
+   * Copy all the source files and commit them atomically to the target
+   * This is typically useful in cases where there is a process
+   * polling for availability of a file/dir. This option is incompatible
+   * with SYNC_FOLDERS & DELETE_MISSING
+   */
+  ATOMIC_COMMIT(DistCpConstants.CONF_LABEL_ATOMIC_COPY,
+      new Option("atomic", false, "Commit all changes or none")),
+
+  /**
+   * Work path to be used only in conjunction with atomic commit
+   */
+  WORK_PATH(DistCpConstants.CONF_LABEL_WORK_PATH,
+      new Option("tmp", true, "Intermediate work path to be used for atomic commit")),
+
+  /**
+   * Log path where distcp output logs are written to
+   */
+  LOG_PATH(DistCpConstants.CONF_LABEL_LOG_PATH,
+      new Option("log", true, "Folder on DFS where distcp execution logs are saved")),
+
+  /**
+   * Copy strategy in use. This could be dynamic or uniform size etc.
+   * DistCp would use an appropriate input format based on this.
+   */
+  COPY_STRATEGY(DistCpConstants.CONF_LABEL_COPY_STRATEGY,
+      new Option("strategy", true, "Copy strategy to use. Default is " +
+          "dividing work based on file sizes")),
+
+  /**
+   * Skip CRC checks between source and target, when determining what
+   * files need to be copied.
+   */
+  SKIP_CRC(DistCpConstants.CONF_LABEL_SKIP_CRC,
+      new Option("skipcrccheck", false, "Whether to skip CRC checks between " +
+          "source and target paths.")),
+
+  /**
+   * Overwrite target-files unconditionally.
+   */
+  OVERWRITE(DistCpConstants.CONF_LABEL_OVERWRITE,
+      new Option("overwrite", false, "Choose to overwrite target files " +
+          "unconditionally, even if they exist.")),
+
+  /**
+   * Should DistCp execution be blocking?
+   */
+  BLOCKING("",
+      new Option("async", false, "Should distcp execution be blocking")),
+
+  FILE_LIMIT("",
+      new Option("filelimit", true, "(Deprecated!) Limit number of files " +
+              "copied to <= n")),
+
+  SIZE_LIMIT("",
+      new Option("sizelimit", true, "(Deprecated!) Limit number of files " +
+              "copied to <= n bytes")),
+
+  /**
+   * Specify bandwidth per map in MB
+   */
+  BANDWIDTH(DistCpConstants.CONF_LABEL_BANDWIDTH_MB,
+      new Option("bandwidth", true, "Specify bandwidth per map in MB"));
+
+  private final String confLabel;
+  private final Option option;
+
+  DistCpOptionSwitch(String confLabel, Option option) {
+    this.confLabel = confLabel;
+    this.option = option;
+  }
+
+  /**
+   * Get Configuration label for the option
+   * @return configuration label name
+   */
+  public String getConfigLabel() {
+    return confLabel;
+  }
+
+  /**
+   * Get CLI Option corresponding to the distcp option
+   * @return option
+   */
+  public Option getOption() {
+    return option;
+  }
+
+  /**
+   * Get Switch symbol
+   * @return switch symbol string
+   */
+  public String getSwitch() {
+    return option.getOpt();
+  }
+
+  @Override
+  public String toString() {
+    return  super.name() + " {" +
+        "confLabel='" + confLabel + '\'' +
+        ", option=" + option + '}';
+  }
+
+  /**
+   * Helper function to add an option to hadoop configuration object
+   * @param conf - Configuration object to include the option
+   * @param option - Option to add
+   * @param value - Value
+   */
+  public static void addToConf(Configuration conf,
+                               DistCpOptionSwitch option,
+                               String value) {
+    conf.set(option.getConfigLabel(), value);
+  }
+
+  /**
+   * Helper function to set a boolean option (to true) in the hadoop configuration object
+   * @param conf - Configuration object to include the option
+   * @param option - Option to add
+   */
+  public static void addToConf(Configuration conf,
+                               DistCpOptionSwitch option) {
+    conf.set(option.getConfigLabel(), "true");
+  }
+}
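
Editor's note, not part of the patch: a minimal usage sketch of the enum above. The addToConf() helpers push a switch's value into the job Configuration under the label defined in DistCpConstants, so that the MapReduce side can read it back later. The wrapper class name below is invented for illustration; the DistCp calls are exactly those declared in this file.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.DistCpOptionSwitch;

public class OptionSwitchSketch {
  public static void main(String[] args) {
    // Illustrative only; not part of the DistCp patch.
    Configuration conf = new Configuration();

    // Boolean switch: stores "true" under the switch's configuration label.
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SYNC_FOLDERS);

    // Valued switch: stores the supplied value under the label
    // (here, DistCpConstants.CONF_LABEL_BANDWIDTH_MB).
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BANDWIDTH, "10");

    // Reading the values back, the same way the mapper/committer would.
    System.out.println(conf.get(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel()));  // "true"
    System.out.println(conf.get(DistCpOptionSwitch.BANDWIDTH.getConfigLabel()));     // "10"
  }
}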

+ 525 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java

@@ -0,0 +1,525 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.tools.util.DistCpUtils;
+
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+/**
+ * The Options class encapsulates all DistCp options.
+ * These may be set from command-line (via the OptionsParser)
+ * or may be set manually.
+ */
+public class DistCpOptions {
+
+  private boolean atomicCommit = false;
+  private boolean syncFolder = false;
+  private boolean deleteMissing = false;
+  private boolean ignoreFailures = false;
+  private boolean overwrite = false;
+  private boolean skipCRC = false;
+  private boolean blocking = true;
+
+  private int maxMaps = DistCpConstants.DEFAULT_MAPS;
+  private int mapBandwidth = DistCpConstants.DEFAULT_BANDWIDTH_MB;
+
+  private String sslConfigurationFile;
+
+  private String copyStrategy = DistCpConstants.UNIFORMSIZE;
+
+  private EnumSet<FileAttribute> preserveStatus = EnumSet.noneOf(FileAttribute.class);
+
+  private Path atomicWorkPath;
+
+  private Path logPath;
+
+  private Path sourceFileListing;
+  private List<Path> sourcePaths;
+
+  private Path targetPath;
+
+  public static enum FileAttribute{
+    REPLICATION, BLOCKSIZE, USER, GROUP, PERMISSION;
+
+    public static FileAttribute getAttribute(char symbol) {
+      for (FileAttribute attribute : values()) {
+        if (attribute.name().charAt(0) == Character.toUpperCase(symbol)) {
+          return attribute;
+        }
+      }
+      throw new NoSuchElementException("No attribute for " + symbol);
+    }
+  }
+
+  /**
+   * Constructor, to initialize source/target paths.
+   * @param sourcePaths List of source-paths (including wildcards)
+   *                     to be copied to target.
+   * @param targetPath Destination path for the dist-copy.
+   */
+  public DistCpOptions(List<Path> sourcePaths, Path targetPath) {
+    assert sourcePaths != null && !sourcePaths.isEmpty() : "Invalid source paths";
+    assert targetPath != null : "Invalid Target path";
+
+    this.sourcePaths = sourcePaths;
+    this.targetPath = targetPath;
+  }
+
+  /**
+   * Constructor, to initialize source/target paths.
+   * @param sourceFileListing File containing list of source paths
+   * @param targetPath Destination path for the dist-copy.
+   */
+  public DistCpOptions(Path sourceFileListing, Path targetPath) {
+    assert sourceFileListing != null : "Invalid source paths";
+    assert targetPath != null : "Invalid Target path";
+
+    this.sourceFileListing = sourceFileListing;
+    this.targetPath = targetPath;
+  }
+
+  /**
+   * Copy constructor.
+   * @param that DistCpOptions being copied from.
+   */
+  public DistCpOptions(DistCpOptions that) {
+    if (this != that && that != null) {
+      this.atomicCommit = that.atomicCommit;
+      this.syncFolder = that.syncFolder;
+      this.deleteMissing = that.deleteMissing;
+      this.ignoreFailures = that.ignoreFailures;
+      this.overwrite = that.overwrite;
+      this.skipCRC = that.skipCRC;
+      this.blocking = that.blocking;
+      this.maxMaps = that.maxMaps;
+      this.mapBandwidth = that.mapBandwidth;
+      this.sslConfigurationFile = that.getSslConfigurationFile();
+      this.copyStrategy = that.copyStrategy;
+      this.preserveStatus = that.preserveStatus;
+      this.atomicWorkPath = that.getAtomicWorkPath();
+      this.logPath = that.getLogPath();
+      this.sourceFileListing = that.getSourceFileListing();
+      this.sourcePaths = that.getSourcePaths();
+      this.targetPath = that.getTargetPath();
+    }
+  }
+
+  /**
+   * Should the data be committed atomically?
+   *
+   * @return true if data should be committed atomically. false otherwise
+   */
+  public boolean shouldAtomicCommit() {
+    return atomicCommit;
+  }
+
+  /**
+   * Set whether data should be committed atomically
+   *
+   * @param atomicCommit - boolean switch
+   */
+  public void setAtomicCommit(boolean atomicCommit) {
+    validate(DistCpOptionSwitch.ATOMIC_COMMIT, atomicCommit);
+    this.atomicCommit = atomicCommit;
+  }
+
+  /**
+   * Should the data be sync'ed between source and target paths?
+   *
+   * @return true if data should be sync'ed up. false otherwise
+   */
+  public boolean shouldSyncFolder() {
+    return syncFolder;
+  }
+
+  /**
+   * Set whether source and target folder contents should be synced up
+   *
+   * @param syncFolder - boolean switch
+   */
+  public void setSyncFolder(boolean syncFolder) {
+    validate(DistCpOptionSwitch.SYNC_FOLDERS, syncFolder);
+    this.syncFolder = syncFolder;
+  }
+
+  /**
+   * Should files in the target that are missing from the source be deleted?
+   *
+   * @return true if orphaned target files are to be removed. false otherwise
+   */
+  public boolean shouldDeleteMissing() {
+    return deleteMissing;
+  }
+
+  /**
+   * Set if files only present in target should be deleted
+   *
+   * @param deleteMissing - boolean switch
+   */
+  public void setDeleteMissing(boolean deleteMissing) {
+    validate(DistCpOptionSwitch.DELETE_MISSING, deleteMissing);
+    this.deleteMissing = deleteMissing;
+  }
+
+  /**
+   * Should failures be logged and ignored during copy?
+   *
+   * @return true if failures are to be logged and ignored. false otherwise
+   */
+  public boolean shouldIgnoreFailures() {
+    return ignoreFailures;
+  }
+
+  /**
+   * Set whether failures during the copy should be ignored
+   *
+   * @param ignoreFailures - boolean switch
+   */
+  public void setIgnoreFailures(boolean ignoreFailures) {
+    this.ignoreFailures = ignoreFailures;
+  }
+
+  /**
+   * Should DistCp run in blocking mode?
+   *
+   * @return true if DistCp should run in blocking mode, false otherwise
+   */
+  public boolean shouldBlock() {
+    return blocking;
+  }
+
+  /**
+   * Set whether DistCp should run in blocking or non-blocking mode
+   *
+   * @param blocking - boolean switch
+   */
+  public void setBlocking(boolean blocking) {
+    this.blocking = blocking;
+  }
+
+  /**
+   * Should files be overwritten always?
+   *
+   * @return true if files that may already exist in the target should always
+   *         be overwritten; false otherwise
+   */
+  public boolean shouldOverwrite() {
+    return overwrite;
+  }
+
+  /**
+   * Set if files should always be overwritten on target
+   *
+   * @param overwrite - boolean switch
+   */
+  public void setOverwrite(boolean overwrite) {
+    validate(DistCpOptionSwitch.OVERWRITE, overwrite);
+    this.overwrite = overwrite;
+  }
+
+  /**
+   * Should CRC/checksum check be skipped while checking files are identical
+   *
+   * @return true if checksum check should be skipped while checking files are
+   *         identical. false otherwise
+   */
+  public boolean shouldSkipCRC() {
+    return skipCRC;
+  }
+
+  /**
+   * Set if checksum comparison should be skipped while determining if
+   * source and destination files are identical
+   *
+   * @param skipCRC - boolean switch
+   */
+  public void setSkipCRC(boolean skipCRC) {
+    validate(DistCpOptionSwitch.SKIP_CRC, skipCRC);
+    this.skipCRC = skipCRC;
+  }
+
+  /** Get the max number of maps to use for this copy
+   *
+   * @return Max number of maps
+   */
+  public int getMaxMaps() {
+    return maxMaps;
+  }
+
+  /**
+   * Set the max number of maps to use for copy
+   *
+   * @param maxMaps - Number of maps
+   */
+  public void setMaxMaps(int maxMaps) {
+    this.maxMaps = maxMaps;
+  }
+
+  /** Get the map bandwidth in MB
+   *
+   * @return Bandwidth in MB
+   */
+  public int getMapBandwidth() {
+    return mapBandwidth;
+  }
+
+  /**
+   * Set per map bandwidth
+   *
+   * @param mapBandwidth - per map bandwidth
+   */
+  public void setMapBandwidth(int mapBandwidth) {
+    assert mapBandwidth > 0 : "Bandwidth " + mapBandwidth + " is invalid (should be > 0)";
+    this.mapBandwidth = mapBandwidth;
+  }
+
+  /**
+   * Get path where the ssl configuration file is present to use for hftps://
+   *
+   * @return Path on local file system
+   */
+  public String getSslConfigurationFile() {
+    return sslConfigurationFile;
+  }
+
+  /**
+   * Set the SSL configuration file path to use with hftps:// (local path)
+   *
+   * @param sslConfigurationFile - Local ssl config file path
+   */
+  public void setSslConfigurationFile(String sslConfigurationFile) {
+    this.sslConfigurationFile = sslConfigurationFile;
+  }
+
+  /**
+   * Returns an iterator with the list of file attributes to preserve
+   *
+   * @return iterator of file attributes to preserve
+   */
+  public Iterator<FileAttribute> preserveAttributes() {
+    return preserveStatus.iterator();
+  }
+
+  /**
+   * Checks if the input attribute should be preserved or not
+   *
+   * @param attribute - Attribute to check
+   * @return True if attribute should be preserved, false otherwise
+   */
+  public boolean shouldPreserve(FileAttribute attribute) {
+    return preserveStatus.contains(attribute);
+  }
+
+  /**
+   * Add file attributes that need to be preserved. This method may be
+   * called multiple times to add attributes.
+   *
+   * @param fileAttribute - Attribute to add, one at a time
+   */
+  public void preserve(FileAttribute fileAttribute) {
+    for (FileAttribute attribute : preserveStatus) {
+      if (attribute.equals(fileAttribute)) {
+        return;
+      }
+    }
+    preserveStatus.add(fileAttribute);
+  }
+
+  /** Get work path for atomic commit. If null, the work
+   * path would be parentOf(targetPath) + "/._WIP_" + nameOf(targetPath)
+   *
+   * @return Atomic work path on the target cluster. Null if not set
+   */
+  public Path getAtomicWorkPath() {
+    return atomicWorkPath;
+  }
+
+  /**
+   * Set the work path for atomic commit
+   *
+   * @param atomicWorkPath - Path on the target cluster
+   */
+  public void setAtomicWorkPath(Path atomicWorkPath) {
+    this.atomicWorkPath = atomicWorkPath;
+  }
+
+  /** Get output directory for writing distcp logs. Otherwise logs
+   * are temporarily written to JobStagingDir/_logs and deleted
+   * upon job completion
+   *
+   * @return Log output path on the cluster where distcp job is run
+   */
+  public Path getLogPath() {
+    return logPath;
+  }
+
+  /**
+   * Set the log path where distcp output logs are stored
+   * Uses JobStagingDir/_logs by default
+   *
+   * @param logPath - Path where logs will be saved
+   */
+  public void setLogPath(Path logPath) {
+    this.logPath = logPath;
+  }
+
+  /**
+   * Get the copy strategy to use. Uses appropriate input format
+   *
+   * @return copy strategy to use
+   */
+  public String getCopyStrategy() {
+    return copyStrategy;
+  }
+
+  /**
+   * Set the copy strategy to use. Should map to a strategy implementation
+   * in distcp-default.xml
+   *
+   * @param copyStrategy - copy Strategy to use
+   */
+  public void setCopyStrategy(String copyStrategy) {
+    this.copyStrategy = copyStrategy;
+  }
+
+  /**
+   * File path (hdfs:// or file://) that contains the list of actual
+   * files to copy
+   *
+   * @return - Source listing file path
+   */
+  public Path getSourceFileListing() {
+    return sourceFileListing;
+  }
+
+  /**
+   * Getter for sourcePaths.
+   * @return List of source-paths.
+   */
+  public List<Path> getSourcePaths() {
+    return sourcePaths;
+  }
+
+  /**
+   * Setter for sourcePaths.
+   * @param sourcePaths The new list of source-paths.
+   */
+  public void setSourcePaths(List<Path> sourcePaths) {
+    assert sourcePaths != null && sourcePaths.size() != 0;
+    this.sourcePaths = sourcePaths;
+  }
+
+  /**
+   * Getter for the targetPath.
+   * @return The target-path.
+   */
+  public Path getTargetPath() {
+    return targetPath;
+  }
+
+  public void validate(DistCpOptionSwitch option, boolean value) {
+
+    boolean syncFolder = (option == DistCpOptionSwitch.SYNC_FOLDERS ?
+        value : this.syncFolder);
+    boolean overwrite = (option == DistCpOptionSwitch.OVERWRITE ?
+        value : this.overwrite);
+    boolean deleteMissing = (option == DistCpOptionSwitch.DELETE_MISSING ?
+        value : this.deleteMissing);
+    boolean atomicCommit = (option == DistCpOptionSwitch.ATOMIC_COMMIT ?
+        value : this.atomicCommit);
+    boolean skipCRC = (option == DistCpOptionSwitch.SKIP_CRC ?
+        value : this.skipCRC);
+
+    if (syncFolder && atomicCommit) {
+      throw new IllegalArgumentException("Atomic commit can't be used with " +
+          "sync folder or overwrite options");
+    }
+
+    if (deleteMissing && !(overwrite || syncFolder)) {
+      throw new IllegalArgumentException("Delete missing is applicable " +
+          "only with update or overwrite options");
+    }
+
+    if (overwrite && syncFolder) {
+      throw new IllegalArgumentException("Overwrite and update options are " +
+          "mutually exclusive");
+    }
+
+    if (!syncFolder && skipCRC) {
+      throw new IllegalArgumentException("Skip CRC is valid only with update options");
+    }
+
+  }
+
+  /**
+   * Add options to configuration. These will be used in the Mapper/committer
+   *
+   * @param conf - Configuration object to which the options need to be added
+   */
+  public void appendToConf(Configuration conf) {
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.ATOMIC_COMMIT,
+        String.valueOf(atomicCommit));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.IGNORE_FAILURES,
+        String.valueOf(ignoreFailures));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SYNC_FOLDERS,
+        String.valueOf(syncFolder));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.DELETE_MISSING,
+        String.valueOf(deleteMissing));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.OVERWRITE,
+        String.valueOf(overwrite));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SKIP_CRC,
+        String.valueOf(skipCRC));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BANDWIDTH,
+        String.valueOf(mapBandwidth));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.PRESERVE_STATUS,
+        DistCpUtils.packAttributes(preserveStatus));
+  }
+
+  /**
+   * Utility to easily string-ify Options, for logging.
+   *
+   * @return String representation of the Options.
+   */
+  @Override
+  public String toString() {
+    return "DistCpOptions{" +
+        "atomicCommit=" + atomicCommit +
+        ", syncFolder=" + syncFolder +
+        ", deleteMissing=" + deleteMissing +
+        ", ignoreFailures=" + ignoreFailures +
+        ", maxMaps=" + maxMaps +
+        ", sslConfigurationFile='" + sslConfigurationFile + '\'' +
+        ", copyStrategy='" + copyStrategy + '\'' +
+        ", sourceFileListing=" + sourceFileListing +
+        ", sourcePaths=" + sourcePaths +
+        ", targetPath=" + targetPath +
+        '}';
+  }
+
+  @Override
+  protected DistCpOptions clone() throws CloneNotSupportedException {
+    return (DistCpOptions) super.clone();
+  }
+}
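
Editor's note, not part of the patch: a small sketch of building DistCpOptions programmatically rather than through OptionsParser. The HDFS URIs and the class name are hypothetical. Because validate() is invoked from the setters, the order of calls matters: the delete-missing option is only accepted once update (or overwrite) has already been turned on.

import java.util.Arrays;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;

public class OptionsSketch {
  public static void main(String[] args) {
    // Illustrative only; not part of the DistCp patch. Paths are hypothetical.
    DistCpOptions options = new DistCpOptions(
        Arrays.asList(new Path("hdfs://nn1/data/src")),   // hypothetical source
        new Path("hdfs://nn2/data/dst"));                 // hypothetical target

    options.setSyncFolder(true);       // -update; must precede setDeleteMissing(true)
    options.setDeleteMissing(true);    // -delete; validate() rejects this without update/overwrite
    options.preserve(FileAttribute.USER);
    options.preserve(FileAttribute.GROUP);

    System.out.println(options);       // uses the toString() defined above
  }
}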

+ 100 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/FileBasedCopyListing.java

@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.security.Credentials;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * FileBasedCopyListing implements the CopyListing interface,
+ * to create the copy-listing for DistCp,
+ * by iterating over all source paths mentioned in a specified input-file.
+ */
+public class FileBasedCopyListing extends CopyListing {
+
+  private final CopyListing globbedListing;
+  /**
+   * Constructor, to initialize base-class.
+   * @param configuration The input Configuration object.
+   * @param credentials - Credentials object on which the FS delegation tokens are cached. If null
+   * delegation token caching is skipped
+   */
+  public FileBasedCopyListing(Configuration configuration, Credentials credentials) {
+    super(configuration, credentials);
+    globbedListing = new GlobbedCopyListing(getConf(), credentials);
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected void validatePaths(DistCpOptions options)
+      throws IOException, InvalidInputException {
+  }
+
+  /**
+   * Implementation of CopyListing::buildListing().
+   *   Iterates over all source paths mentioned in the input-file.
+   * @param pathToListFile Path on HDFS where the listing file is written.
+   * @param options Input Options for DistCp (indicating source/target paths.)
+   * @throws IOException
+   */
+  @Override
+  public void doBuildListing(Path pathToListFile, DistCpOptions options) throws IOException {
+    DistCpOptions newOption = new DistCpOptions(options);
+    newOption.setSourcePaths(fetchFileList(options.getSourceFileListing()));
+    globbedListing.buildListing(pathToListFile, newOption);
+  }
+
+  private List<Path> fetchFileList(Path sourceListing) throws IOException {
+    List<Path> result = new ArrayList<Path>();
+    FileSystem fs = sourceListing.getFileSystem(getConf());
+    BufferedReader input = null;
+    try {
+      input = new BufferedReader(new InputStreamReader(fs.open(sourceListing)));
+      String line = input.readLine();
+      while (line != null) {
+        result.add(new Path(line));
+        line = input.readLine();
+      }
+    } finally {
+      IOUtils.closeStream(input);
+    }
+    return result;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getBytesToCopy() {
+    return globbedListing.getBytesToCopy();
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getNumberOfPaths() {
+    return globbedListing.getNumberOfPaths();
+  }
+}
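
Editor's note, not part of the patch: fetchFileList() above treats the -f listing as plain text, one fully qualified source path per line; each line becomes a Path that is then globbed like any command-line source. A hypothetical listing file might look like:

hdfs://nn1/user/alice/dataset/part-00000
hdfs://nn1/user/alice/dataset/part-00001
hdfs://nn1/user/alice/logs/2012-01-26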

+ 105 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java

@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.security.Credentials;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * GlobbedCopyListing implements the CopyListing interface, to create the copy
+ * listing-file by "globbing" all specified source paths (wild-cards and all.)
+ */
+public class GlobbedCopyListing extends CopyListing {
+  private static final Log LOG = LogFactory.getLog(GlobbedCopyListing.class);
+
+  private final CopyListing simpleListing;
+  /**
+   * Constructor, to initialize the configuration.
+   * @param configuration The input Configuration object.
+   * @param credentials Credentials object on which the FS delegation tokens are cached. If null
+   * delegation token caching is skipped
+   */
+  public GlobbedCopyListing(Configuration configuration, Credentials credentials) {
+    super(configuration, credentials);
+    simpleListing = new SimpleCopyListing(getConf(), credentials) ;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected void validatePaths(DistCpOptions options)
+      throws IOException, InvalidInputException {
+  }
+
+  /**
+   * Implementation of CopyListing::buildListing().
+   * Creates the copy listing by "globbing" all source-paths.
+   * @param pathToListingFile The location at which the copy-listing file
+   *                           is to be created.
+   * @param options Input Options for DistCp (indicating source/target paths.)
+   * @throws IOException
+   */
+  @Override
+  public void doBuildListing(Path pathToListingFile,
+                             DistCpOptions options) throws IOException {
+
+    List<Path> globbedPaths = new ArrayList<Path>();
+    if (options.getSourcePaths().isEmpty()) {
+      throw new InvalidInputException("Nothing to process. Source paths::EMPTY");  
+    }
+
+    for (Path p : options.getSourcePaths()) {
+      FileSystem fs = p.getFileSystem(getConf());
+      FileStatus[] inputs = fs.globStatus(p);
+
+      if(inputs != null && inputs.length > 0) {
+        for (FileStatus onePath: inputs) {
+          globbedPaths.add(onePath.getPath());
+        }
+      } else {
+        throw new InvalidInputException(p + " doesn't exist");        
+      }
+    }
+
+    DistCpOptions optionsGlobbed = new DistCpOptions(options);
+    optionsGlobbed.setSourcePaths(globbedPaths);
+    simpleListing.buildListing(pathToListingFile, optionsGlobbed);
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getBytesToCopy() {
+    return simpleListing.getBytesToCopy();
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getNumberOfPaths() {
+    return simpleListing.getNumberOfPaths();
+  }
+
+}
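
Editor's note, not part of the patch: a hedged sketch of driving GlobbedCopyListing directly, outside the DistCp driver. Wild-cards in each source path are expanded with FileSystem.globStatus(), and the expanded paths are handed to SimpleCopyListing, which writes the actual listing file. The class name and all paths below are hypothetical; passing null credentials skips delegation-token caching, as the constructor documents.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.GlobbedCopyListing;

public class GlobbedListingSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative only; not part of the DistCp patch. Paths are hypothetical.
    Configuration conf = new Configuration();

    DistCpOptions options = new DistCpOptions(
        Arrays.asList(new Path("hdfs://nn1/data/2012-01-*")),  // wildcard source
        new Path("hdfs://nn2/backup"));                        // target

    // Writes one SequenceFile entry per globbed file/directory.
    new GlobbedCopyListing(conf, null)
        .buildListing(new Path("/tmp/distcp-meta/fileList.seq"), options);
  }
}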

+ 246 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java

@@ -0,0 +1,246 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.cli.*;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
+
+import java.util.*;
+
+/**
+ * The OptionsParser parses out the command-line options passed to DistCp,
+ * and interprets those specific to DistCp, to create an Options object.
+ */
+public class OptionsParser {
+
+  private static final Log LOG = LogFactory.getLog(OptionsParser.class);
+
+  private static final Options cliOptions = new Options();      
+
+  static {
+    for (DistCpOptionSwitch option : DistCpOptionSwitch.values()) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Adding option " + option.getOption());
+      }
+      cliOptions.addOption(option.getOption());
+    }
+  }
+
+  private static class CustomParser extends GnuParser {
+    @Override
+    protected String[] flatten(Options options, String[] arguments, boolean stopAtNonOption) {
+      for (int index = 0; index < arguments.length; index++) {
+        if (arguments[index].equals("-" + DistCpOptionSwitch.PRESERVE_STATUS.getSwitch())) {
+          arguments[index] = "-prbugp";
+        }
+      }
+      return super.flatten(options, arguments, stopAtNonOption);
+    }
+  }
+
+  /**
+   * The parse method parses the command-line options, and creates
+   * a corresponding Options object.
+   * @param args Command-line arguments (excluding the options consumed
+   *              by the GenericOptionsParser).
+   * @return The Options object, corresponding to the specified command-line.
+   * @throws IllegalArgumentException: Thrown if the parse fails.
+   */
+  public static DistCpOptions parse(String args[]) throws IllegalArgumentException {
+
+    CommandLineParser parser = new CustomParser();
+
+    CommandLine command;
+    try {
+      command = parser.parse(cliOptions, args, true);
+    } catch (ParseException e) {
+      throw new IllegalArgumentException("Unable to parse arguments. " +
+        Arrays.toString(args), e);
+    }
+
+    DistCpOptions option;
+    Path targetPath;
+    List<Path> sourcePaths = new ArrayList<Path>();
+
+    String leftOverArgs[] = command.getArgs();
+    if (leftOverArgs == null || leftOverArgs.length < 1) {
+      throw new IllegalArgumentException("Target path not specified");
+    }
+
+    //Last Argument is the target path
+    targetPath = new Path(leftOverArgs[leftOverArgs.length -1].trim());
+
+    //Copy any source paths in the arguments to the list
+    for (int index = 0; index < leftOverArgs.length - 1; index++) {
+      sourcePaths.add(new Path(leftOverArgs[index].trim()));
+    }
+
+    /* If the command has a source file listing, use it; else fall back on the
+       source paths in the arguments. If both are present, throw an exception and bail */
+    if (command.hasOption(DistCpOptionSwitch.SOURCE_FILE_LISTING.getSwitch())) {
+      if (!sourcePaths.isEmpty()) {
+        throw new IllegalArgumentException("Both source file listing and source paths present");
+      }
+      option = new DistCpOptions(new Path(getVal(command, DistCpOptionSwitch.
+              SOURCE_FILE_LISTING.getSwitch())), targetPath);
+    } else {
+      if (sourcePaths.isEmpty()) {
+        throw new IllegalArgumentException("Neither source file listing nor source paths present");
+      }
+      option = new DistCpOptions(sourcePaths, targetPath);
+    }
+
+    //Process all the other option switches and set options appropriately
+    if (command.hasOption(DistCpOptionSwitch.IGNORE_FAILURES.getSwitch())) {
+      option.setIgnoreFailures(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.ATOMIC_COMMIT.getSwitch())) {
+      option.setAtomicCommit(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.WORK_PATH.getSwitch()) &&
+        option.shouldAtomicCommit()) {
+      String workPath = getVal(command, DistCpOptionSwitch.WORK_PATH.getSwitch());
+      if (workPath != null && !workPath.isEmpty()) {
+        option.setAtomicWorkPath(new Path(workPath));
+      }
+    } else if (command.hasOption(DistCpOptionSwitch.WORK_PATH.getSwitch())) {
+      throw new IllegalArgumentException("-tmp work-path can only be specified along with -atomic");      
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.LOG_PATH.getSwitch())) {
+      option.setLogPath(new Path(getVal(command, DistCpOptionSwitch.LOG_PATH.getSwitch())));
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.SYNC_FOLDERS.getSwitch())) {
+      option.setSyncFolder(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.OVERWRITE.getSwitch())) {
+      option.setOverwrite(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.DELETE_MISSING.getSwitch())) {
+      option.setDeleteMissing(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.SKIP_CRC.getSwitch())) {
+      option.setSkipCRC(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.BLOCKING.getSwitch())) {
+      option.setBlocking(false);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.BANDWIDTH.getSwitch())) {
+      try {
+        Integer mapBandwidth = Integer.parseInt(
+            getVal(command, DistCpOptionSwitch.BANDWIDTH.getSwitch()).trim());
+        option.setMapBandwidth(mapBandwidth);
+      } catch (NumberFormatException e) {
+        throw new IllegalArgumentException("Bandwidth specified is invalid: " +
+            getVal(command, DistCpOptionSwitch.BANDWIDTH.getSwitch()), e);
+      }
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.SSL_CONF.getSwitch())) {
+      option.setSslConfigurationFile(command.
+          getOptionValue(DistCpOptionSwitch.SSL_CONF.getSwitch()));
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.MAX_MAPS.getSwitch())) {
+      try {
+        Integer maps = Integer.parseInt(
+            getVal(command, DistCpOptionSwitch.MAX_MAPS.getSwitch()).trim());
+        option.setMaxMaps(maps);
+      } catch (NumberFormatException e) {
+        throw new IllegalArgumentException("Number of maps is invalid: " +
+            getVal(command, DistCpOptionSwitch.MAX_MAPS.getSwitch()), e);
+      }
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.COPY_STRATEGY.getSwitch())) {
+      option.setCopyStrategy(
+            getVal(command, DistCpOptionSwitch.COPY_STRATEGY.getSwitch()));
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.PRESERVE_STATUS.getSwitch())) {
+      String attributes =
+          getVal(command, DistCpOptionSwitch.PRESERVE_STATUS.getSwitch());
+      if (attributes == null || attributes.isEmpty()) {
+        for (FileAttribute attribute : FileAttribute.values()) {
+          option.preserve(attribute);
+        }
+      } else {
+        for (int index = 0; index < attributes.length(); index++) {
+          option.preserve(FileAttribute.
+              getAttribute(attributes.charAt(index)));
+        }
+      }
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.FILE_LIMIT.getSwitch())) {
+      String fileLimitString = getVal(command,
+                              DistCpOptionSwitch.FILE_LIMIT.getSwitch());
+      try {
+        Integer.parseInt(fileLimitString);
+      }
+      catch (NumberFormatException e) {
+        throw new IllegalArgumentException("File-limit is invalid: "
+                                            + fileLimitString, e);
+      }
+      LOG.warn(DistCpOptionSwitch.FILE_LIMIT.getSwitch() + " is a deprecated" +
+              " option. Ignoring.");
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.SIZE_LIMIT.getSwitch())) {
+      String sizeLimitString = getVal(command,
+                              DistCpOptionSwitch.SIZE_LIMIT.getSwitch());
+      try {
+        Long.parseLong(sizeLimitString);
+      }
+      catch (NumberFormatException e) {
+        throw new IllegalArgumentException("Size-limit is invalid: "
+                                            + sizeLimitString, e);
+      }
+      LOG.warn(DistCpOptionSwitch.SIZE_LIMIT.getSwitch() + " is a deprecated" +
+              " option. Ignoring.");
+    }
+
+    return option;
+  }
+
+  private static String getVal(CommandLine command, String swtch) {
+    String optionValue = command.getOptionValue(swtch);
+    if (optionValue == null) {
+      return null;
+    } else {
+      return optionValue.trim();
+    }
+  }
+
+  public static void usage() {
+    HelpFormatter formatter = new HelpFormatter();
+    formatter.printHelp("distcp OPTIONS [source_path...] <target_path>\n\nOPTIONS", cliOptions);
+  }
+}
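
Editor's note, not part of the patch: a minimal usage sketch of OptionsParser. The arguments mirror a command line such as "hadoop distcp -update -delete -m 20 <src> <dst>", where every token before the last is a source path and the last token is the target; the cluster URIs and class name are hypothetical.

import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.OptionsParser;

public class ParserSketch {
  public static void main(String[] args) {
    // Illustrative only; not part of the DistCp patch. Paths are hypothetical.
    DistCpOptions options = OptionsParser.parse(new String[] {
        "-update", "-delete", "-m", "20",
        "hdfs://nn1/data/src",   // source
        "hdfs://nn2/data/dst"    // target is always the last argument
    });

    System.out.println(options.shouldSyncFolder());     // true
    System.out.println(options.shouldDeleteMissing());  // true
    System.out.println(options.getMaxMaps());           // 20
  }
}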

+ 275 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java

@@ -0,0 +1,275 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.mapreduce.security.TokenCache;
+import org.apache.hadoop.security.Credentials;
+
+import java.io.*;
+import java.util.Stack;
+
+/**
+ * The SimpleCopyListing is responsible for making the exhaustive list of
+ * all files/directories under its specified list of input-paths.
+ * These are written into the specified copy-listing file.
+ * Note: The SimpleCopyListing doesn't handle wild-cards in the input-paths.
+ */
+public class SimpleCopyListing extends CopyListing {
+  private static final Log LOG = LogFactory.getLog(SimpleCopyListing.class);
+
+  private long totalPaths = 0;
+  private long totalBytesToCopy = 0;
+
+  /**
+   * Protected constructor, to initialize configuration.
+   *
+   * @param configuration The input configuration, with which the source/target FileSystems may be accessed.
+   * @param credentials - Credentials object on which the FS delegation tokens are cached. If null
+   * delegation token caching is skipped
+   */
+  protected SimpleCopyListing(Configuration configuration, Credentials credentials) {
+    super(configuration, credentials);
+  }
+
+  @Override
+  protected void validatePaths(DistCpOptions options)
+      throws IOException, InvalidInputException {
+
+    Path targetPath = options.getTargetPath();
+    FileSystem targetFS = targetPath.getFileSystem(getConf());
+    boolean targetIsFile = targetFS.isFile(targetPath);
+
+    //If target is a file, then source has to be single file
+    if (targetIsFile) {
+      if (options.getSourcePaths().size() > 1) {
+        throw new InvalidInputException("Multiple source being copied to a file: " +
+            targetPath);
+      }
+
+      Path srcPath = options.getSourcePaths().get(0);
+      FileSystem sourceFS = srcPath.getFileSystem(getConf());
+      if (!sourceFS.isFile(srcPath)) {
+        throw new InvalidInputException("Cannot copy " + srcPath +
+            ", which is not a file to " + targetPath);
+      }
+    }
+
+    if (options.shouldAtomicCommit() && targetFS.exists(targetPath)) {
+      throw new InvalidInputException("Target path for atomic-commit already exists: " +
+        targetPath + ". Cannot atomic-commit to pre-existing target-path.");
+    }
+
+    for (Path path: options.getSourcePaths()) {
+      FileSystem fs = path.getFileSystem(getConf());
+      if (!fs.exists(path)) {
+        throw new InvalidInputException(path + " doesn't exist");
+      }
+    }
+
+    /* This is required to allow map tasks to access each of the source
+       clusters. This retrieves the delegation token for each unique
+       file system and adds it to the job's private credential store
+     */
+    Credentials credentials = getCredentials();
+    if (credentials != null) {
+      Path[] inputPaths = options.getSourcePaths().toArray(new Path[1]);
+      TokenCache.obtainTokensForNamenodes(credentials, inputPaths, getConf());
+    }
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void doBuildListing(Path pathToListingFile, DistCpOptions options) throws IOException {
+
+    SequenceFile.Writer fileListWriter = null;
+
+    try {
+      fileListWriter = getWriter(pathToListingFile);
+
+      for (Path path: options.getSourcePaths()) {
+        FileSystem sourceFS = path.getFileSystem(getConf());
+        path = makeQualified(path);
+
+        FileStatus rootStatus = sourceFS.getFileStatus(path);
+        Path sourcePathRoot = computeSourceRootPath(rootStatus, options);
+        boolean localFile = (rootStatus.getClass() != FileStatus.class);
+
+        FileStatus[] sourceFiles = sourceFS.listStatus(path);
+        if (sourceFiles != null && sourceFiles.length > 0) {
+          for (FileStatus sourceStatus: sourceFiles) {
+            if (LOG.isDebugEnabled()) {
+              LOG.debug("Recording source-path: " + sourceStatus.getPath() + " for copy.");
+            }
+            writeToFileListing(fileListWriter, sourceStatus, sourcePathRoot, localFile);
+
+            if (isDirectoryAndNotEmpty(sourceFS, sourceStatus)) {
+              if (LOG.isDebugEnabled()) {
+                LOG.debug("Traversing non-empty source dir: " + sourceStatus.getPath());
+              }
+              traverseNonEmptyDirectory(fileListWriter, sourceStatus, sourcePathRoot, localFile);
+            }
+          }
+        } else {
+          writeToFileListing(fileListWriter, rootStatus, sourcePathRoot, localFile);
+        }
+      }
+    } finally {
+      IOUtils.closeStream(fileListWriter);
+    }
+  }
+
+  private Path computeSourceRootPath(FileStatus sourceStatus,
+                                     DistCpOptions options) throws IOException {
+
+    Path target = options.getTargetPath();
+    FileSystem targetFS = target.getFileSystem(getConf());
+
+    boolean solitaryFile = options.getSourcePaths().size() == 1
+                                                && !sourceStatus.isDirectory();
+
+    if (solitaryFile) {
+      if (targetFS.isFile(target) || !targetFS.exists(target)) {
+        return sourceStatus.getPath();
+      } else {
+        return sourceStatus.getPath().getParent();
+      }
+    } else {
+      boolean specialHandling = (options.getSourcePaths().size() == 1 && !targetFS.exists(target)) ||
+          options.shouldSyncFolder() || options.shouldOverwrite();
+
+      return specialHandling && sourceStatus.isDirectory() ? sourceStatus.getPath() :
+          sourceStatus.getPath().getParent();
+    }
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getBytesToCopy() {
+    return totalBytesToCopy;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getNumberOfPaths() {
+    return totalPaths;
+  }
+
+  private Path makeQualified(Path path) throws IOException {
+    final FileSystem fs = path.getFileSystem(getConf());
+    return path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
+  }
+
+  private SequenceFile.Writer getWriter(Path pathToListFile) throws IOException {
+    FileSystem fs = pathToListFile.getFileSystem(getConf());
+    if (fs.exists(pathToListFile)) {
+      fs.delete(pathToListFile, false);
+    }
+    return SequenceFile.createWriter(getConf(),
+            SequenceFile.Writer.file(pathToListFile),
+            SequenceFile.Writer.keyClass(Text.class),
+            SequenceFile.Writer.valueClass(FileStatus.class),
+            SequenceFile.Writer.compression(SequenceFile.CompressionType.NONE));
+  }
+
+  private static boolean isDirectoryAndNotEmpty(FileSystem fileSystem,
+                                    FileStatus fileStatus) throws IOException {
+    return fileStatus.isDirectory() && getChildren(fileSystem, fileStatus).length > 0;
+  }
+
+  private static FileStatus[] getChildren(FileSystem fileSystem,
+                                         FileStatus parent) throws IOException {
+    return fileSystem.listStatus(parent.getPath());
+  }
+
+  private void traverseNonEmptyDirectory(SequenceFile.Writer fileListWriter,
+                                         FileStatus sourceStatus,
+                                         Path sourcePathRoot, boolean localFile)
+                                         throws IOException {
+    FileSystem sourceFS = sourcePathRoot.getFileSystem(getConf());
+    Stack<FileStatus> pathStack = new Stack<FileStatus>();
+    pathStack.push(sourceStatus);
+
+    while (!pathStack.isEmpty()) {
+      for (FileStatus child: getChildren(sourceFS, pathStack.pop())) {
+        if (LOG.isDebugEnabled())
+          LOG.debug("Recording source-path: "
+                    + sourceStatus.getPath() + " for copy.");
+        writeToFileListing(fileListWriter, child, sourcePathRoot, localFile);
+        if (isDirectoryAndNotEmpty(sourceFS, child)) {
+          if (LOG.isDebugEnabled())
+            LOG.debug("Traversing non-empty source dir: "
+                       + sourceStatus.getPath());
+          pathStack.push(child);
+        }
+      }
+    }
+  }
+
+  private void writeToFileListing(SequenceFile.Writer fileListWriter,
+                                  FileStatus fileStatus, Path sourcePathRoot,
+                                  boolean localFile) throws IOException {
+    if (fileStatus.getPath().equals(sourcePathRoot) && fileStatus.isDirectory())
+      return; // Skip the root-paths.
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("REL PATH: " + DistCpUtils.getRelativePath(sourcePathRoot,
+        fileStatus.getPath()) + ", FULL PATH: " + fileStatus.getPath());
+    }
+
+    FileStatus status = fileStatus;
+    if (localFile) {
+      status = getFileStatus(fileStatus);
+    }
+
+    fileListWriter.append(new Text(DistCpUtils.getRelativePath(sourcePathRoot,
+        fileStatus.getPath())), status);
+    fileListWriter.sync();
+
+    if (!fileStatus.isDirectory()) {
+      totalBytesToCopy += fileStatus.getLen();
+    }
+    totalPaths++;
+  }
+
+  private static final ByteArrayOutputStream buffer = new ByteArrayOutputStream(64);
+  private DataInputBuffer in = new DataInputBuffer();
+  
+  private FileStatus getFileStatus(FileStatus fileStatus) throws IOException {
+    FileStatus status = new FileStatus();
+
+    buffer.reset();
+    DataOutputStream out = new DataOutputStream(buffer);
+    fileStatus.write(out);
+
+    in.reset(buffer.toByteArray(), 0, buffer.size());
+    status.readFields(in);
+    return status;
+  }
+}
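
Editor's note, not part of the patch: the listing that SimpleCopyListing writes is a SequenceFile keyed by the path relative to the source root (Text), with the corresponding FileStatus as the value. A hedged sketch of reading one back, assuming a hypothetical listing location and class name, is:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ListingReaderSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative only; not part of the DistCp patch. The listing path is hypothetical.
    Configuration conf = new Configuration();
    Path listing = new Path("/tmp/distcp-meta/fileList.seq");

    SequenceFile.Reader reader =
        new SequenceFile.Reader(conf, SequenceFile.Reader.file(listing));
    try {
      Text relPath = new Text();
      FileStatus status = new FileStatus();
      while (reader.next(relPath, status)) {
        System.out.println(relPath + " -> " +
            (status.isDirectory() ? "dir" : status.getLen() + " bytes"));
      }
    } finally {
      reader.close();
    }
  }
}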

+ 297 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java

@@ -0,0 +1,297 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
+import org.apache.hadoop.tools.*;
+import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
+import org.apache.hadoop.tools.util.DistCpUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+/**
+ * The CopyCommitter class is DistCp's OutputCommitter implementation. It is
+ * responsible for handling the completion/cleanup of the DistCp run.
+ * Specifically, it does the following:
+ *  1. Cleanup of the meta-folder (where DistCp maintains its file-list, etc.)
+ *  2. Preservation of user/group/replication-factor on any directories that
+ *     have been copied. (Files are taken care of in their map-tasks.)
+ *  3. Atomic-move of data from the temporary work-folder to the final path
+ *     (if atomic-commit was opted for).
+ *  4. Deletion of files from the target that are missing at source (if opted for).
+ *  5. Cleanup of any partially copied files, from previous, failed attempts.
+ */
+public class CopyCommitter extends FileOutputCommitter {
+  private static final Log LOG = LogFactory.getLog(CopyCommitter.class);
+
+  private final TaskAttemptContext taskAttemptContext;
+
+  /**
+   * Create a output committer
+   *
+   * @param outputPath the job's output path
+   * @param context    the task's context
+   * @throws IOException - Exception if any
+   */
+  public CopyCommitter(Path outputPath, TaskAttemptContext context) throws IOException {
+    super(outputPath, context);
+    this.taskAttemptContext = context;
+  }
+
+  /** @inheritDoc */
+  @Override
+  public void commitJob(JobContext jobContext) throws IOException {
+    Configuration conf = jobContext.getConfiguration();
+    super.commitJob(jobContext);
+
+    cleanupTempFiles(jobContext);
+
+    String attributes = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
+    if (attributes != null && !attributes.isEmpty()) {
+      preserveFileAttributesForDirectories(conf);
+    }
+
+    try {
+      if (conf.getBoolean(DistCpConstants.CONF_LABEL_DELETE_MISSING, false)) {
+        deleteMissing(conf);
+      } else if (conf.getBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, false)) {
+        commitData(conf);
+      }
+      taskAttemptContext.setStatus("Commit Successful");
+    }
+    finally {
+      cleanup(conf);
+    }
+  }
+
+  /** @inheritDoc */
+  @Override
+  public void abortJob(JobContext jobContext,
+                       JobStatus.State state) throws IOException {
+    try {
+      super.abortJob(jobContext, state);
+    } finally {
+      cleanupTempFiles(jobContext);
+      cleanup(jobContext.getConfiguration());
+    }
+  }
+
+  private void cleanupTempFiles(JobContext context) {
+    try {
+      Configuration conf = context.getConfiguration();
+
+      Path targetWorkPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+      FileSystem targetFS = targetWorkPath.getFileSystem(conf);
+
+      String jobId = context.getJobID().toString();
+      deleteAttemptTempFiles(targetWorkPath, targetFS, jobId);
+      deleteAttemptTempFiles(targetWorkPath.getParent(), targetFS, jobId);
+    } catch (Throwable t) {
+      LOG.warn("Unable to cleanup temp files", t);
+    }
+  }
+
+  private void deleteAttemptTempFiles(Path targetWorkPath,
+                                      FileSystem targetFS,
+                                      String jobId) throws IOException {
+
+    FileStatus[] tempFiles = targetFS.globStatus(
+        new Path(targetWorkPath, ".distcp.tmp." + jobId.replaceAll("job","attempt") + "*"));
+
+    if (tempFiles != null && tempFiles.length > 0) {
+      for (FileStatus file : tempFiles) {
+        LOG.info("Cleaning up " + file.getPath());
+        targetFS.delete(file.getPath(), false);
+      }
+    }
+  }
+
+  /**
+   * Cleanup meta folder and other temporary files
+   *
+   * @param conf - Job Configuration
+   */
+  private void cleanup(Configuration conf) {
+    Path metaFolder = new Path(conf.get(DistCpConstants.CONF_LABEL_META_FOLDER));
+    try {
+      FileSystem fs = metaFolder.getFileSystem(conf);
+      LOG.info("Cleaning up temporary work folder: " + metaFolder);
+      fs.delete(metaFolder, true);
+    } catch (IOException ignore) {
+      LOG.error("Exception encountered ", ignore);
+    }
+  }
+
+  // This method changes the target-directories' file-attributes (owner,
+  // user/group permissions, etc.) based on the corresponding source directories.
+  private void preserveFileAttributesForDirectories(Configuration conf) throws IOException {
+    String attrSymbols = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
+    LOG.info("About to preserve attributes: " + attrSymbols);
+
+    EnumSet<FileAttribute> attributes = DistCpUtils.unpackAttributes(attrSymbols);
+
+    Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
+    FileSystem clusterFS = sourceListing.getFileSystem(conf);
+    SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf,
+                                      SequenceFile.Reader.file(sourceListing));
+    long totalLen = clusterFS.getFileStatus(sourceListing).getLen();
+
+    Path targetRoot = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+
+    long preservedEntries = 0;
+    try {
+      FileStatus srcFileStatus = new FileStatus();
+      Text srcRelPath = new Text();
+
+      // Iterate over every source path that was copied.
+      while (sourceReader.next(srcRelPath, srcFileStatus)) {
+        // File-attributes for files are set at the time of copy,
+        // in the map-task.
+        if (! srcFileStatus.isDirectory()) continue;
+
+        Path targetFile = new Path(targetRoot.toString() + "/" + srcRelPath);
+
+        // Skip the root folder.
+        // Status can't be preserved on root-folder. (E.g. multiple paths may
+        // be copied to a single target folder. Which source-attributes to use
+        // on the target is undefined.)
+        if (targetRoot.equals(targetFile)) continue;
+
+        FileSystem targetFS = targetFile.getFileSystem(conf);
+        DistCpUtils.preserve(targetFS, targetFile, srcFileStatus,  attributes);
+
+        taskAttemptContext.progress();
+        taskAttemptContext.setStatus("Preserving status on directory entries. [" +
+            sourceReader.getPosition() * 100 / totalLen + "%]");
+      }
+    } finally {
+      IOUtils.closeStream(sourceReader);
+    }
+    LOG.info("Preserved status on " + preservedEntries + " dir entries on target");
+  }
+
+  // This method deletes "extra" files from the target, if they're not
+  // available at the source.
+  private void deleteMissing(Configuration conf) throws IOException {
+    LOG.info("-delete option is enabled. About to remove entries from " +
+        "target that are missing in source");
+
+    // Sort the source-file listing alphabetically.
+    Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
+    FileSystem clusterFS = sourceListing.getFileSystem(conf);
+    Path sortedSourceListing = DistCpUtils.sortListing(clusterFS, conf, sourceListing);
+
+    // Similarly, create the listing of target-files. Sort alphabetically.
+    Path targetListing = new Path(sourceListing.getParent(), "targetListing.seq");
+    CopyListing target = new GlobbedCopyListing(new Configuration(conf), null);
+
+    List<Path> targets = new ArrayList<Path>(1);
+    Path targetFinalPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
+    targets.add(targetFinalPath);
+    DistCpOptions options = new DistCpOptions(targets, new Path("/NONE"));
+
+    target.buildListing(targetListing, options);
+    Path sortedTargetListing = DistCpUtils.sortListing(clusterFS, conf, targetListing);
+    long totalLen = clusterFS.getFileStatus(sortedTargetListing).getLen();
+
+    SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf,
+                                 SequenceFile.Reader.file(sortedSourceListing));
+    SequenceFile.Reader targetReader = new SequenceFile.Reader(conf,
+                                 SequenceFile.Reader.file(sortedTargetListing));
+
+    // Walk both source and target file listings.
+    // Delete all from target that doesn't also exist on source.
+    long deletedEntries = 0;
+    try {
+      FileStatus srcFileStatus = new FileStatus();
+      Text srcRelPath = new Text();
+      FileStatus trgtFileStatus = new FileStatus();
+      Text trgtRelPath = new Text();
+
+      FileSystem targetFS = targetFinalPath.getFileSystem(conf);
+      boolean srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
+      while (targetReader.next(trgtRelPath, trgtFileStatus)) {
+        // Skip sources that don't exist on target.
+        while (srcAvailable && trgtRelPath.compareTo(srcRelPath) > 0) {
+          srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
+        }
+
+        if (srcAvailable && trgtRelPath.equals(srcRelPath)) continue;
+
+        // Target doesn't exist at source. Delete.
+        boolean result = (!targetFS.exists(trgtFileStatus.getPath()) ||
+            targetFS.delete(trgtFileStatus.getPath(), true));
+        if (result) {
+          LOG.info("Deleted " + trgtFileStatus.getPath() + " - Missing at source");
+          deletedEntries++;
+        } else {
+          throw new IOException("Unable to delete " + trgtFileStatus.getPath());
+        }
+        taskAttemptContext.progress();
+        taskAttemptContext.setStatus("Deleting missing files from target. [" +
+            targetReader.getPosition() * 100 / totalLen + "%]");
+      }
+    } finally {
+      IOUtils.closeStream(sourceReader);
+      IOUtils.closeStream(targetReader);
+    }
+    LOG.info("Deleted " + deletedEntries + " from target: " + targets.get(0));
+  }
+
+  private void commitData(Configuration conf) throws IOException {
+
+    Path workDir = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+    Path finalDir = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
+    FileSystem targetFS = workDir.getFileSystem(conf);
+
+    LOG.info("Atomic commit enabled. Moving " + workDir + " to " + finalDir);
+    if (targetFS.exists(finalDir) && targetFS.exists(workDir)) {
+      LOG.error("Pre-existing final-path found at: " + finalDir);
+      throw new IOException("Target-path can't be committed to because it " +
+          "exists at " + finalDir + ". Copied data is in temp-dir: " + workDir + ". ");
+    }
+
+    boolean result = targetFS.rename(workDir, finalDir);
+    if (!result) {
+      LOG.warn("Rename failed. Perhaps data already moved. Verifying...");
+      result = targetFS.exists(finalDir) && !targetFS.exists(workDir);
+    }
+    if (result) {
+      LOG.info("Data committed successfully to " + finalDir);
+      taskAttemptContext.setStatus("Data committed successfully to " + finalDir);
+    } else {
+      LOG.error("Unable to commit data to " + finalDir);
+      throw new IOException("Atomic commit failed. Temporary data in " + workDir +
+        ", Unable to move to " + finalDir);
+    }
+  }
+}
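
The deleteMissing() method above is essentially a two-pointer walk over the two sorted listings: advance through the target listing, and for each target entry let the source listing catch up; any target entry with no matching source entry is deleted. A minimal standalone sketch of that walk, using in-memory lists in place of the SequenceFile readers (class and method names here are illustrative only, not part of the patch):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class DeleteMissingSketch {
  // Given sorted relative paths from the source and target listings, return
  // the target entries with no counterpart at the source (deletion candidates).
  static List<String> extraTargets(List<String> sortedSource, List<String> sortedTarget) {
    List<String> toDelete = new ArrayList<String>();
    int s = 0;
    for (String target : sortedTarget) {
      // Advance the source pointer past entries that sort before this target.
      while (s < sortedSource.size() && target.compareTo(sortedSource.get(s)) > 0) {
        s++;
      }
      if (s < sortedSource.size() && target.equals(sortedSource.get(s))) {
        continue;               // Present on both sides; keep it.
      }
      toDelete.add(target);     // Missing at source; would be deleted from target.
    }
    return toDelete;
  }

  public static void main(String[] args) {
    List<String> source = Arrays.asList("/a", "/a/1", "/c");
    List<String> target = Arrays.asList("/a", "/a/1", "/b", "/c", "/d");
    System.out.println(extraTargets(source, target));  // Prints: [/b, /d]
  }
}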

+ 330 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyMapper.java

@@ -0,0 +1,330 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.tools.DistCpOptionSwitch;
+import org.apache.hadoop.tools.DistCpOptions;
+import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.util.StringUtils;
+
+import java.io.*;
+import java.util.EnumSet;
+import java.util.Arrays;
+
+/**
+ * Mapper class that executes the DistCp copy operation.
+ * Extends the o.a.h.mapreduce.Mapper<> class.
+ */
+public class CopyMapper extends Mapper<Text, FileStatus, Text, Text> {
+
+  /**
+   * Hadoop counters for the DistCp CopyMapper.
+   * (These have been kept identical to the old DistCp,
+   * for backward compatibility.)
+   */
+  public static enum Counter {
+    COPY,         // Number of files received by the mapper for copy.
+    SKIP,         // Number of files skipped.
+    FAIL,         // Number of files that failed to be copied.
+    BYTESCOPIED,  // Number of bytes actually copied by the copy-mapper, total.
+    BYTESEXPECTED,// Number of bytes expected to be copied.
+    BYTESFAILED,  // Number of bytes that failed to be copied.
+    BYTESSKIPPED, // Number of bytes that were skipped from copy.
+  }
+
+  private static Log LOG = LogFactory.getLog(CopyMapper.class);
+
+  private Configuration conf;
+
+  private boolean syncFolders = false;
+  private boolean ignoreFailures = false;
+  private boolean skipCrc = false;
+  private boolean overWrite = false;
+  private EnumSet<FileAttribute> preserve = EnumSet.noneOf(FileAttribute.class);
+
+  private FileSystem targetFS = null;
+  private Path    targetWorkPath = null;
+
+  /**
+   * Implementation of the Mapper::setup() method. This extracts the DistCp
+   * options specified in the Job's configuration, to set up the mapper.
+   * @param context Mapper's context.
+   * @throws IOException On IO failure.
+   * @throws InterruptedException If the job is interrupted.
+   */
+  @Override
+  public void setup(Context context) throws IOException, InterruptedException {
+    conf = context.getConfiguration();
+
+    syncFolders = conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false);
+    ignoreFailures = conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false);
+    skipCrc = conf.getBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(), false);
+    overWrite = conf.getBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(), false);
+    preserve = DistCpUtils.unpackAttributes(conf.get(DistCpOptionSwitch.
+        PRESERVE_STATUS.getConfigLabel()));
+
+    targetWorkPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+    Path targetFinalPath = new Path(conf.get(
+            DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
+    targetFS = targetFinalPath.getFileSystem(conf);
+
+    if (targetFS.exists(targetFinalPath) && targetFS.isFile(targetFinalPath)) {
+      overWrite = true; // When target is an existing file, overwrite it.
+    }
+
+    if (conf.get(DistCpConstants.CONF_LABEL_SSL_CONF) != null) {
+      initializeSSLConf(context);
+    }
+  }
+
+  /**
+   * Initializes the SSL configuration, if one has been specified in the job
+   * configuration.
+   *
+   * @throws IOException On failure to load or write the SSL configuration.
+   */
+  private void initializeSSLConf(Context context) throws IOException {
+    LOG.info("Initializing SSL configuration");
+    
+    String workDir = conf.get(JobContext.JOB_LOCAL_DIR) + "/work";
+    Path[] cacheFiles = context.getLocalCacheFiles();
+
+    Configuration sslConfig = new Configuration(false);
+    String sslConfFileName = conf.get(DistCpConstants.CONF_LABEL_SSL_CONF);
+    Path sslClient = findCacheFile(cacheFiles, sslConfFileName);
+    if (sslClient == null) {
+      LOG.warn("SSL Client config file not found. Was looking for " + sslConfFileName +
+          " in " + Arrays.toString(cacheFiles));
+      return;
+    }
+    sslConfig.addResource(sslClient);
+
+    String trustStoreFile = conf.get("ssl.client.truststore.location");
+    Path trustStorePath = findCacheFile(cacheFiles, trustStoreFile);
+    sslConfig.set("ssl.client.truststore.location", trustStorePath.toString());
+
+    String keyStoreFile = conf.get("ssl.client.keystore.location");
+    Path keyStorePath = findCacheFile(cacheFiles, keyStoreFile);
+    sslConfig.set("ssl.client.keystore.location", keyStorePath.toString());
+
+    try {
+      OutputStream out = new FileOutputStream(workDir + "/" + sslConfFileName);
+      try {
+        sslConfig.writeXml(out);
+      } finally {
+        out.close();
+      }
+      conf.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfFileName);
+    } catch (IOException e) {
+      LOG.warn("Unable to write out the ssl configuration. " +
+          "Will fall back to default ssl-client.xml in class path, if there is one", e);
+    }
+  }
+
+  /**
+   * Finds an entry in the distributed cache.
+   *
+   * @param cacheFiles All localized cache files.
+   * @param fileName The file name to search for.
+   * @return Path of the file if found, else null.
+   */
+  private Path findCacheFile(Path[] cacheFiles, String fileName) {
+    if (cacheFiles != null && cacheFiles.length > 0) {
+      for (Path file : cacheFiles) {
+        if (file.getName().equals(fileName)) {
+          return file;
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Implementation of the Mapper<>::map(). Does the copy.
+   * @param relPath The file's path relative to the copy root; it is appended
+   *                to the target work-path to form the target path.
+   * @param sourceFileStatus FileStatus of the source file to be copied.
+   * @throws IOException
+   */
+  @Override
+  public void map(Text relPath, FileStatus sourceFileStatus, Context context)
+          throws IOException, InterruptedException {
+    Path sourcePath = sourceFileStatus.getPath();
+
+    if (LOG.isDebugEnabled())
+      LOG.debug("DistCpMapper::map(): Received " + sourcePath + ", " + relPath);
+
+    Path target = new Path(targetWorkPath.makeQualified(targetFS.getUri(),
+                          targetFS.getWorkingDirectory()) + relPath.toString());
+
+    EnumSet<DistCpOptions.FileAttribute> fileAttributes
+            = getFileAttributeSettings(context);
+
+    final String description = "Copying " + sourcePath + " to " + target;
+    context.setStatus(description);
+
+    LOG.info(description);
+
+    try {
+      FileStatus sourceCurrStatus;
+      FileSystem sourceFS;
+      try {
+        sourceFS = sourcePath.getFileSystem(conf);
+        sourceCurrStatus = sourceFS.getFileStatus(sourcePath);
+      } catch (FileNotFoundException e) {
+        throw new IOException(new RetriableFileCopyCommand.CopyReadException(e));
+      }
+
+      FileStatus targetStatus = null;
+
+      try {
+        targetStatus = targetFS.getFileStatus(target);
+      } catch (FileNotFoundException ignore) {
+        if (LOG.isDebugEnabled())
+          LOG.debug("Path could not be found: " + target, ignore);
+      }
+
+      if (targetStatus != null && (targetStatus.isDirectory() != sourceCurrStatus.isDirectory())) {
+        throw new IOException("Can't replace " + target + ". Target is " +
+            getFileType(targetStatus) + ", Source is " + getFileType(sourceCurrStatus));
+      }
+
+      if (sourceCurrStatus.isDirectory()) {
+        createTargetDirsWithRetry(description, target, context);
+        return;
+      }
+
+      if (skipFile(sourceFS, sourceCurrStatus, target)) {
+        LOG.info("Skipping copy of " + sourceCurrStatus.getPath()
+                 + " to " + target);
+        updateSkipCounters(context, sourceCurrStatus);
+        context.write(null, new Text("SKIP: " + sourceCurrStatus.getPath()));
+      }
+      else {
+        copyFileWithRetry(description, sourceCurrStatus, target, context,
+                          fileAttributes);
+      }
+
+      DistCpUtils.preserve(target.getFileSystem(conf), target,
+                           sourceCurrStatus, fileAttributes);
+
+    } catch (IOException exception) {
+      handleFailures(exception, sourceFileStatus, target, context);
+    }
+  }
+
+  private String getFileType(FileStatus fileStatus) {
+    return fileStatus == null ? "N/A" : (fileStatus.isDirectory() ? "dir" : "file");
+  }
+
+  private static EnumSet<DistCpOptions.FileAttribute>
+          getFileAttributeSettings(Mapper.Context context) {
+    String attributeString = context.getConfiguration().get(
+            DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel());
+    return DistCpUtils.unpackAttributes(attributeString);
+  }
+
+  private void copyFileWithRetry(String description, FileStatus sourceFileStatus,
+               Path target, Context context,
+               EnumSet<DistCpOptions.FileAttribute> fileAttributes) throws IOException {
+
+    long bytesCopied;
+    try {
+      bytesCopied = (Long)new RetriableFileCopyCommand(description)
+                       .execute(sourceFileStatus, target, context, fileAttributes);
+    } catch (Exception e) {
+      context.setStatus("Copy Failure: " + sourceFileStatus.getPath());
+      throw new IOException("File copy failed: " + sourceFileStatus.getPath() +
+          " --> " + target, e);
+    }
+    incrementCounter(context, Counter.BYTESEXPECTED, sourceFileStatus.getLen());
+    incrementCounter(context, Counter.BYTESCOPIED, bytesCopied);
+    incrementCounter(context, Counter.COPY, 1);
+  }
+
+  private void createTargetDirsWithRetry(String description,
+                   Path target, Context context) throws IOException {
+    try {
+      new RetriableDirectoryCreateCommand(description).execute(target, context);
+    } catch (Exception e) {
+      throw new IOException("mkdir failed for " + target, e);
+    }
+    incrementCounter(context, Counter.COPY, 1);
+  }
+
+  private static void updateSkipCounters(Context context,
+                                         FileStatus sourceFile) {
+    incrementCounter(context, Counter.SKIP, 1);
+    incrementCounter(context, Counter.BYTESSKIPPED, sourceFile.getLen());
+
+  }
+
+  private void handleFailures(IOException exception,
+                                     FileStatus sourceFileStatus, Path target,
+                                     Context context) throws IOException, InterruptedException {
+    LOG.error("Failure in copying " + sourceFileStatus.getPath() + " to " +
+                target, exception);
+
+    if (ignoreFailures && exception.getCause() instanceof
+            RetriableFileCopyCommand.CopyReadException) {
+      incrementCounter(context, Counter.FAIL, 1);
+      incrementCounter(context, Counter.BYTESFAILED, sourceFileStatus.getLen());
+      context.write(null, new Text("FAIL: " + sourceFileStatus.getPath() + " - " + 
+          StringUtils.stringifyException(exception)));
+    }
+    else
+      throw exception;
+  }
+
+  private static void incrementCounter(Context context, Counter counter,
+                                       long value) {
+    context.getCounter(counter).increment(value);
+  }
+
+  private boolean skipFile(FileSystem sourceFS, FileStatus source, Path target)
+                                          throws IOException {
+    return     targetFS.exists(target)
+            && !overWrite
+            && !mustUpdate(sourceFS, source, target);
+  }
+
+  private boolean mustUpdate(FileSystem sourceFS, FileStatus source, Path target)
+                                    throws IOException {
+    final FileStatus targetFileStatus = targetFS.getFileStatus(target);
+
+    return     syncFolders
+            && (
+                   targetFileStatus.getLen() != source.getLen()
+                || (!skipCrc &&
+                       !DistCpUtils.checksumsAreEqual(sourceFS,
+                                          source.getPath(), targetFS, target))
+                || (source.getBlockSize() != targetFileStatus.getBlockSize() &&
+                      preserve.contains(FileAttribute.BLOCKSIZE))
+               );
+  }
+}
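
The skip logic at the bottom of CopyMapper (skipFile() and mustUpdate()) is the heart of the -update/-overwrite handling. The following self-contained sketch restates that decision as a pure function over booleans, so the interplay of the flags is easier to see; all names are illustrative and not part of the patch:

public class SkipDecisionSketch {
  // A file is skipped only if it already exists on the target, -overwrite is
  // not set, and (under -update) length, checksum and, if preserved, block
  // size all match -- mirroring skipFile()/mustUpdate() above.
  static boolean skip(boolean targetExists, boolean overwrite, boolean syncFolders,
                      boolean sameLength, boolean sameChecksum, boolean sameBlockSize,
                      boolean preserveBlockSize, boolean skipCrc) {
    if (!targetExists || overwrite) {
      return false;                       // Nothing on target, or forced copy.
    }
    boolean mustUpdate = syncFolders
        && (!sameLength
            || (!skipCrc && !sameChecksum)
            || (preserveBlockSize && !sameBlockSize));
    return !mustUpdate;                   // Skip unless an update is required.
  }

  public static void main(String[] args) {
    // -update set and lengths differ: the file is copied (not skipped).
    System.out.println(skip(true, false, true, false, true, true, false, false)); // false
    // No -update, no -overwrite, target exists: the file is skipped.
    System.out.println(skip(true, false, false, true, true, true, false, false)); // true
  }
}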

+ 124 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyOutputFormat.java

@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.security.TokenCache;
+import org.apache.hadoop.tools.DistCpConstants;
+
+import java.io.IOException;
+
+/**
+ * The CopyOutputFormat is the Hadoop OutputFormat used in DistCp.
+ * It sets up the Job's Configuration (in the Job-Context) with the settings
+ * for the work-directory, final commit-directory, etc. It also sets the right
+ * output-committer.
+ * @param <K>
+ * @param <V>
+ */
+public class CopyOutputFormat<K, V> extends TextOutputFormat<K, V> {
+
+  /**
+   * Setter for the working directory for DistCp (where files will be copied
+   * before they are moved to the final commit-directory.)
+   * @param job The Job on whose configuration the working-directory is to be set.
+   * @param workingDirectory The path to use as the working directory.
+   */
+  public static void setWorkingDirectory(Job job, Path workingDirectory) {
+    job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH,
+        workingDirectory.toString());
+  }
+
+  /**
+   * Setter for the final commit-directory for DistCp (where copied files will
+   * be moved, atomically.)
+   * @param job The Job on whose configuration the commit-directory is to be set.
+   * @param commitDirectory The path to use for final commit.
+   */
+  public static void setCommitDirectory(Job job, Path commitDirectory) {
+    job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH,
+        commitDirectory.toString());
+  }
+
+  /**
+   * Getter for the working directory.
+   * @param job The Job from whose configuration the working-directory is to
+   * be retrieved.
+   * @return The working-directory Path.
+   */
+  public static Path getWorkingDirectory(Job job) {
+    return getWorkingDirectory(job.getConfiguration());
+  }
+
+  private static Path getWorkingDirectory(Configuration conf) {
+    String workingDirectory = conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH);
+    if (workingDirectory == null || workingDirectory.isEmpty()) {
+      return null;
+    } else {
+      return new Path(workingDirectory);
+    }
+  }
+
+  /**
+   * Getter for the final commit-directory.
+   * @param job The Job from whose configuration the commit-directory is to be
+   * retrieved.
+   * @return The commit-directory Path.
+   */
+  public static Path getCommitDirectory(Job job) {
+    return getCommitDirectory(job.getConfiguration());
+  }
+
+  private static Path getCommitDirectory(Configuration conf) {
+    String commitDirectory = conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH);
+    if (commitDirectory == null || commitDirectory.isEmpty()) {
+      return null;
+    } else {
+      return new Path(commitDirectory);
+    }
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
+    return new CopyCommitter(getOutputPath(context), context);
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void checkOutputSpecs(JobContext context) throws IOException {
+    Configuration conf = context.getConfiguration();
+
+    if (getCommitDirectory(conf) == null) {
+      throw new IllegalStateException("Commit directory not configured");
+    }
+
+    Path workingPath = getWorkingDirectory(conf);
+    if (workingPath == null) {
+      throw new IllegalStateException("Working directory not configured");
+    }
+
+    // get delegation token for outDir's file system
+    TokenCache.obtainTokensForNamenodes(context.getCredentials(),
+                                        new Path[] {workingPath}, conf);
+  }
+}
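
A short sketch of how a driver would wire CopyOutputFormat into a Job, roughly as the DistCp driver does. The paths are hypothetical, and the Job.getInstance factory is assumed to be available on this branch (otherwise the older new Job(conf) constructor serves the same purpose):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.tools.mapred.CopyOutputFormat;

public class CopyOutputFormatUsageSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "distcp-sketch");

    // The work-path receives files while the copy is in flight; the commit
    // path is where CopyCommitter moves them on (atomic) commit.
    CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/distcp/_work"));  // hypothetical path
    CopyOutputFormat.setCommitDirectory(job, new Path("/data/copied"));        // hypothetical path
    job.setOutputFormatClass(CopyOutputFormat.class);

    // checkOutputSpecs() will now pass, since both directories are configured.
    System.out.println("work dir   = " + CopyOutputFormat.getWorkingDirectory(job));
    System.out.println("commit dir = " + CopyOutputFormat.getCommitDirectory(job));
  }
}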

+ 56 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableDirectoryCreateCommand.java

@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.hadoop.tools.util.RetriableCommand;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.mapreduce.Mapper;
+
+/**
+ * This class extends RetriableCommand to implement the creation of directories,
+ * with retries on failure.
+ */
+public class RetriableDirectoryCreateCommand extends RetriableCommand {
+
+  /**
+   * Constructor, taking a description of the action.
+   * @param description Verbose description of the copy operation.
+   */
+  public RetriableDirectoryCreateCommand(String description) {
+    super(description);
+  }
+
+  /**
+   * Implementation of RetriableCommand::doExecute().
+   * This implements the actual mkdirs() functionality.
+   * @param arguments Argument-list to the command.
+   * @return Boolean. True, if the directory could be created successfully.
+   * @throws Exception IOException, on failure to create the directory.
+   */
+  @Override
+  protected Object doExecute(Object... arguments) throws Exception {
+    assert arguments.length == 2 : "Unexpected argument list.";
+    Path target = (Path)arguments[0];
+    Mapper.Context context = (Mapper.Context)arguments[1];
+
+    FileSystem targetFS = target.getFileSystem(context.getConfiguration());
+    return targetFS.mkdirs(target);
+  }
+}
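
Any new retriable operation follows the same shape: subclass RetriableCommand and put the fallible work inside doExecute(). The class below is a hypothetical retriable delete, shown only to illustrate that pattern; it is not part of this patch:

package org.apache.hadoop.tools.mapred;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.tools.util.RetriableCommand;

public class RetriableDeleteCommand extends RetriableCommand {

  public RetriableDeleteCommand(String description) {
    super(description);
  }

  @Override
  protected Object doExecute(Object... arguments) throws Exception {
    assert arguments.length == 2 : "Unexpected argument list.";
    Path target = (Path) arguments[0];
    Mapper.Context context = (Mapper.Context) arguments[1];

    FileSystem targetFS = target.getFileSystem(context.getConfiguration());
    // Recursive delete; retried by RetriableCommand.execute() on failure.
    return targetFS.delete(target, true);
  }
}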

+ 245 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java

@@ -0,0 +1,245 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.hadoop.tools.util.RetriableCommand;
+import org.apache.hadoop.tools.util.ThrottledInputStream;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.tools.DistCpOptions.*;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.io.*;
+import java.util.EnumSet;
+
+/**
+ * This class extends RetriableCommand to implement the copy of files,
+ * with retries on failure.
+ */
+public class RetriableFileCopyCommand extends RetriableCommand {
+
+  private static Log LOG = LogFactory.getLog(RetriableFileCopyCommand.class);
+  private static int BUFFER_SIZE = 8 * 1024;
+
+  /**
+   * Constructor, taking a description of the action.
+   * @param description Verbose description of the copy operation.
+   */
+  public RetriableFileCopyCommand(String description) {
+    super(description);
+  }
+
+  /**
+   * Implementation of RetriableCommand::doExecute().
+   * This is the actual copy-implementation.
+   * @param arguments Argument-list to the command.
+   * @return Number of bytes copied.
+   * @throws Exception: CopyReadException, if there are read-failures. All other
+   *         failures are IOExceptions.
+   */
+  @SuppressWarnings("unchecked")
+  @Override
+  protected Object doExecute(Object... arguments) throws Exception {
+    assert arguments.length == 4 : "Unexpected argument list.";
+    FileStatus source = (FileStatus)arguments[0];
+    assert !source.isDirectory() : "Unexpected file-status. Expected file.";
+    Path target = (Path)arguments[1];
+    Mapper.Context context = (Mapper.Context)arguments[2];
+    EnumSet<FileAttribute> fileAttributes
+            = (EnumSet<FileAttribute>)arguments[3];
+    return doCopy(source, target, context, fileAttributes);
+  }
+
+  private long doCopy(FileStatus sourceFileStatus, Path target,
+                      Mapper.Context context,
+                      EnumSet<FileAttribute> fileAttributes)
+          throws IOException {
+
+    Path tmpTargetPath = getTmpFile(target, context);
+    final Configuration configuration = context.getConfiguration();
+    FileSystem targetFS = target.getFileSystem(configuration);
+
+    try {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Copying " + sourceFileStatus.getPath() + " to " + target);
+        LOG.debug("Tmp-file path: " + tmpTargetPath);
+      }
+      FileSystem sourceFS = sourceFileStatus.getPath().getFileSystem(
+              configuration);
+      long bytesRead = copyToTmpFile(tmpTargetPath, targetFS, sourceFileStatus,
+                                     context, fileAttributes);
+
+      compareFileLengths(sourceFileStatus, tmpTargetPath, configuration, bytesRead);
+      compareCheckSums(sourceFS, sourceFileStatus.getPath(), targetFS, tmpTargetPath);
+      promoteTmpToTarget(tmpTargetPath, target, targetFS);
+      return bytesRead;
+
+    } finally {
+      if (targetFS.exists(tmpTargetPath))
+        targetFS.delete(tmpTargetPath, false);
+    }
+  }
+
+  private long copyToTmpFile(Path tmpTargetPath, FileSystem targetFS,
+                             FileStatus sourceFileStatus, Mapper.Context context,
+                             EnumSet<FileAttribute> fileAttributes)
+                             throws IOException {
+    OutputStream outStream = new BufferedOutputStream(targetFS.create(
+            tmpTargetPath, true, BUFFER_SIZE,
+            getReplicationFactor(fileAttributes, sourceFileStatus, targetFS),
+            getBlockSize(fileAttributes, sourceFileStatus, targetFS), context));
+    return copyBytes(sourceFileStatus, outStream, BUFFER_SIZE, true, context);
+  }
+
+  private void compareFileLengths(FileStatus sourceFileStatus, Path target,
+                                  Configuration configuration, long bytesRead)
+                                  throws IOException {
+    final Path sourcePath = sourceFileStatus.getPath();
+    FileSystem fs = sourcePath.getFileSystem(configuration);
+    if (fs.getFileStatus(sourcePath).getLen() != bytesRead)
+      throw new IOException("Mismatch in length of source:" + sourcePath
+                + " and target:" + target);
+  }
+
+  private void compareCheckSums(FileSystem sourceFS, Path source,
+                                FileSystem targetFS, Path target)
+                                throws IOException {
+    if (!DistCpUtils.checksumsAreEqual(sourceFS, source, targetFS, target))
+      throw new IOException("Check-sum mismatch between "
+                              + source + " and " + target);
+
+  }
+
+  //If target file exists and unable to delete target - fail
+  //If target doesn't exist and unable to create parent folder - fail
+  //If target is successfully deleted and parent exists, if rename fails - fail
+  private void promoteTmpToTarget(Path tmpTarget, Path target, FileSystem fs)
+                                  throws IOException {
+    if ((fs.exists(target) && !fs.delete(target, false))
+        || (!fs.exists(target.getParent()) && !fs.mkdirs(target.getParent()))
+        || !fs.rename(tmpTarget, target)) {
+      throw new IOException("Failed to promote tmp-file:" + tmpTarget
+                              + " to: " + target);
+    }
+  }
+
+  private Path getTmpFile(Path target, Mapper.Context context) {
+    Path targetWorkPath = new Path(context.getConfiguration().
+        get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+
+    Path root = target.equals(targetWorkPath)? targetWorkPath.getParent() : targetWorkPath;
+    LOG.info("Creating temp file: " +
+        new Path(root, ".distcp.tmp." + context.getTaskAttemptID().toString()));
+    return new Path(root, ".distcp.tmp." + context.getTaskAttemptID().toString());
+  }
+
+  private long copyBytes(FileStatus sourceFileStatus, OutputStream outStream,
+                         int bufferSize, boolean mustCloseStream,
+                         Mapper.Context context) throws IOException {
+    Path source = sourceFileStatus.getPath();
+    byte buf[] = new byte[bufferSize];
+    ThrottledInputStream inStream = null;
+    long totalBytesRead = 0;
+
+    try {
+      inStream = getInputStream(source, context.getConfiguration());
+      int bytesRead = readBytes(inStream, buf);
+      while (bytesRead >= 0) {
+        totalBytesRead += bytesRead;
+        outStream.write(buf, 0, bytesRead);
+        updateContextStatus(totalBytesRead, context, sourceFileStatus);
+        // Use readBytes() here too, so mid-stream read failures are also
+        // wrapped as CopyReadException (matching the first read above).
+        bytesRead = readBytes(inStream, buf);
+      }
+    } finally {
+      if (mustCloseStream)
+        IOUtils.cleanup(LOG, outStream, inStream);
+    }
+
+    return totalBytesRead;
+  }
+
+  private void updateContextStatus(long totalBytesRead, Mapper.Context context,
+                                   FileStatus sourceFileStatus) {
+    StringBuilder message = new StringBuilder(DistCpUtils.getFormatter()
+                .format(totalBytesRead * 100.0f / sourceFileStatus.getLen()));
+    message.append("% ")
+            .append(description).append(" [")
+            .append(DistCpUtils.getStringDescriptionFor(totalBytesRead))
+            .append('/')
+        .append(DistCpUtils.getStringDescriptionFor(sourceFileStatus.getLen()))
+            .append(']');
+    context.setStatus(message.toString());
+  }
+
+  private static int readBytes(InputStream inStream, byte buf[])
+          throws IOException {
+    try {
+      return inStream.read(buf);
+    }
+    catch (IOException e) {
+      throw new CopyReadException(e);
+    }
+  }
+
+  private static ThrottledInputStream getInputStream(Path path, Configuration conf)
+          throws IOException {
+    try {
+      FileSystem fs = path.getFileSystem(conf);
+      long bandwidthMB = conf.getInt(DistCpConstants.CONF_LABEL_BANDWIDTH_MB,
+              DistCpConstants.DEFAULT_BANDWIDTH_MB);
+      return new ThrottledInputStream(new BufferedInputStream(fs.open(path)),
+              bandwidthMB * 1024 * 1024);
+    }
+    catch (IOException e) {
+      throw new CopyReadException(e);
+    }
+  }
+
+  private static short getReplicationFactor(
+          EnumSet<FileAttribute> fileAttributes,
+          FileStatus sourceFile, FileSystem targetFS) {
+    return fileAttributes.contains(FileAttribute.REPLICATION)?
+            sourceFile.getReplication() : targetFS.getDefaultReplication();
+  }
+
+  private static long getBlockSize(
+          EnumSet<FileAttribute> fileAttributes,
+          FileStatus sourceFile, FileSystem targetFS) {
+    return fileAttributes.contains(FileAttribute.BLOCKSIZE)?
+            sourceFile.getBlockSize() : targetFS.getDefaultBlockSize();
+  }
+
+  /**
+   * Special subclass of IOException. This is used to distinguish read-operation
+   * failures from other kinds of IOExceptions.
+   * The failure to read from source is dealt with specially, in the CopyMapper.
+   * Such failures may be skipped if the DistCpOptions indicate so.
+   * Write failures are not tolerated, and amount to CopyMapper failure.
+   */
+  public static class CopyReadException extends IOException {
+    public CopyReadException(Throwable rootCause) {
+      super(rootCause);
+    }
+  }
+}
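
The bandwidth limit in getInputStream() above is applied purely by wrapping the source stream. A small local sketch of the same idea, assuming the (InputStream, bytes-per-second) constructor that getInputStream() uses; the file path and rate below are hypothetical:

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.hadoop.tools.util.ThrottledInputStream;

public class ThrottleSketch {
  public static void main(String[] args) throws Exception {
    // Read a local file through the same throttling wrapper the copy uses,
    // capped at roughly 1 MB/sec.
    int bytesPerSecond = 1024 * 1024;
    InputStream in = new ThrottledInputStream(
        new BufferedInputStream(new FileInputStream("/tmp/sample.bin")), bytesPerSecond);
    byte[] buf = new byte[8 * 1024];
    long total = 0;
    try {
      for (int n = in.read(buf); n >= 0; n = in.read(buf)) {
        total += n;
      }
    } finally {
      in.close();
    }
    System.out.println("Read " + total + " bytes at <= " + bytesPerSecond + " bytes/sec");
  }
}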

+ 169 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/UniformSizeInputFormat.java

@@ -0,0 +1,169 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.conf.Configuration;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * UniformSizeInputFormat extends the InputFormat<> class, to produce
+ * input-splits for DistCp.
+ * It looks at the copy-listing and groups the contents into input-splits such
+ * that the total-number of bytes to be copied for each input split is
+ * uniform.
+ */
+public class UniformSizeInputFormat extends InputFormat<Text, FileStatus> {
+  private static final Log LOG
+                = LogFactory.getLog(UniformSizeInputFormat.class);
+
+  /**
+   * Implementation of InputFormat::getSplits(). Returns a list of InputSplits,
+   * such that the number of bytes to be copied for all the splits are
+   * approximately equal.
+   * @param context JobContext for the job.
+   * @return The list of uniformly-distributed input-splits.
+   * @throws IOException: On failure.
+   * @throws InterruptedException
+   */
+  @Override
+  public List<InputSplit> getSplits(JobContext context)
+                      throws IOException, InterruptedException {
+    Configuration configuration = context.getConfiguration();
+    int numSplits = DistCpUtils.getInt(configuration,
+                                       JobContext.NUM_MAPS);
+
+    if (numSplits == 0) return new ArrayList<InputSplit>();
+
+    return getSplits(configuration, numSplits,
+                     DistCpUtils.getLong(configuration,
+                          DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED));
+  }
+
+  private List<InputSplit> getSplits(Configuration configuration, int numSplits,
+                                     long totalSizeBytes) throws IOException {
+    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
+    long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);
+
+    FileStatus srcFileStatus = new FileStatus();
+    Text srcRelPath = new Text();
+    long currentSplitSize = 0;
+    long lastSplitStart = 0;
+    long lastPosition = 0;
+
+    final Path listingFilePath = getListingFilePath(configuration);
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Average bytes per map: " + nBytesPerSplit +
+          ", Number of maps: " + numSplits + ", total size: " + totalSizeBytes);
+    }
+    SequenceFile.Reader reader=null;
+    try {
+      reader = getListingFileReader(configuration);
+      while (reader.next(srcRelPath, srcFileStatus)) {
+        // If adding the current file would push this split past the per-split
+        // byte limit, close out the current split and start a new one that
+        // begins with the current file.
+        if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) {
+          FileSplit split = new FileSplit(listingFilePath, lastSplitStart,
+              lastPosition - lastSplitStart, null);
+          if (LOG.isDebugEnabled()) {
+            LOG.debug ("Creating split : " + split + ", bytes in split: " + currentSplitSize);
+          }
+          splits.add(split);
+          lastSplitStart = lastPosition;
+          currentSplitSize = 0;
+        }
+        currentSplitSize += srcFileStatus.getLen();
+        lastPosition = reader.getPosition();
+      }
+      if (lastPosition > lastSplitStart) {
+        FileSplit split = new FileSplit(listingFilePath, lastSplitStart,
+            lastPosition - lastSplitStart, null);
+        if (LOG.isDebugEnabled()) {
+          LOG.info ("Creating split : " + split + ", bytes in split: " + currentSplitSize);
+        }
+        splits.add(split);
+      }
+
+    } finally {
+      IOUtils.closeStream(reader);
+    }
+
+    return splits;
+  }
+
+  private static Path getListingFilePath(Configuration configuration) {
+    final String listingFilePathString =
+            configuration.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
+
+    assert !listingFilePathString.equals("")
+              : "Couldn't find listing file. Invalid input.";
+    return new Path(listingFilePathString);
+  }
+
+  private SequenceFile.Reader getListingFileReader(Configuration configuration) {
+
+    final Path listingFilePath = getListingFilePath(configuration);
+    try {
+      final FileSystem fileSystem = listingFilePath.getFileSystem(configuration);
+      if (!fileSystem.exists(listingFilePath))
+        throw new IllegalArgumentException("Listing file doesn't exist at: "
+                                           + listingFilePath);
+
+      return new SequenceFile.Reader(configuration,
+                                     SequenceFile.Reader.file(listingFilePath));
+    }
+    catch (IOException exception) {
+      LOG.error("Couldn't find listing file at: " + listingFilePath, exception);
+      throw new IllegalArgumentException("Couldn't find listing-file at: "
+                                         + listingFilePath, exception);
+    }
+  }
+
+  /**
+   * Implementation of InputFormat::createRecordReader().
+   * @param split The split for which the RecordReader is sought.
+   * @param context The context of the current task-attempt.
+   * @return A SequenceFileRecordReader instance, (since the copy-listing is a
+   * simple sequence-file.)
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  @Override
+  public RecordReader<Text, FileStatus> createRecordReader(InputSplit split,
+                                                     TaskAttemptContext context)
+                                      throws IOException, InterruptedException {
+    return new SequenceFileRecordReader<Text, FileStatus>();
+  }
+}
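
The grouping performed by getSplits() above can be restated without any Hadoop types: walk the listing-ordered file sizes and cut a new split whenever adding the next file would exceed the per-split byte budget. A standalone sketch, with a long[] of sizes standing in for the per-record FileStatus lengths (names are illustrative only):

import java.util.ArrayList;
import java.util.List;

public class UniformSplitSketch {
  static List<List<Long>> group(long[] sizes, int numSplits) {
    long total = 0;
    for (long s : sizes) total += s;
    long bytesPerSplit = (long) Math.ceil(total * 1.0 / numSplits);

    List<List<Long>> splits = new ArrayList<List<Long>>();
    List<Long> current = new ArrayList<Long>();
    long currentBytes = 0;
    for (long size : sizes) {
      // Close the current split if this file would push it past the limit.
      if (currentBytes + size > bytesPerSplit && !current.isEmpty()) {
        splits.add(current);
        current = new ArrayList<Long>();
        currentBytes = 0;
      }
      current.add(size);
      currentBytes += size;
    }
    if (!current.isEmpty()) splits.add(current);
    return splits;
  }

  public static void main(String[] args) {
    long[] sizes = {700, 200, 100, 500, 400, 100};   // 2000 bytes in total
    System.out.println(group(sizes, 2));             // [[700, 200, 100], [500, 400, 100]]
  }
}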

+ 246 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputChunk.java

@@ -0,0 +1,246 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.tools.mapred.lib;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskID;
+
+import java.io.IOException;
+
+/**
+ * The DynamicInputChunk represents a single chunk of work, when used in
+ * conjunction with the DynamicInputFormat and the DynamicRecordReader.
+ * The records in the DynamicInputFormat's input-file are split across various
+ * DynamicInputChunks. Each one is claimed and processed in an iteration of
+ * a dynamic-mapper. When a DynamicInputChunk has been exhausted, a faster
+ * mapper may claim another and process it, until there are no more chunks
+ * left to consume.
+ */
+class DynamicInputChunk<K, V> {
+  private static Log LOG = LogFactory.getLog(DynamicInputChunk.class);
+
+  private static Configuration configuration;
+  private static Path chunkRootPath;
+  private static String chunkFilePrefix;
+  private static int numChunksLeft = -1; // Un-initialized before 1st dir-scan.
+  private static FileSystem fs;
+
+  private Path chunkFilePath;
+  private SequenceFileRecordReader<K, V> reader;
+  private SequenceFile.Writer writer;
+
+  private static void initializeChunkInvariants(Configuration config)
+                                                  throws IOException {
+    configuration = config;
+    Path listingFilePath = new Path(getListingFilePath(configuration));
+    chunkRootPath = new Path(listingFilePath.getParent(), "chunkDir");
+    fs = chunkRootPath.getFileSystem(configuration);
+    chunkFilePrefix = listingFilePath.getName() + ".chunk.";
+  }
+
+  private static String getListingFilePath(Configuration configuration) {
+    final String listingFileString = configuration.get(
+            DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
+    assert !listingFileString.equals("") : "Listing file not found.";
+    return listingFileString;
+  }
+
+  private static boolean areInvariantsInitialized() {
+    return chunkRootPath != null;
+  }
+
+  private DynamicInputChunk(String chunkId, Configuration configuration)
+                                                      throws IOException {
+    if (!areInvariantsInitialized())
+      initializeChunkInvariants(configuration);
+
+    chunkFilePath = new Path(chunkRootPath, chunkFilePrefix + chunkId);
+    openForWrite();
+  }
+
+
+  private void openForWrite() throws IOException {
+    writer = SequenceFile.createWriter(
+            chunkFilePath.getFileSystem(configuration), configuration,
+            chunkFilePath, Text.class, FileStatus.class,
+            SequenceFile.CompressionType.NONE);
+
+  }
+
+  /**
+   * Factory method to create chunk-files for writing to.
+   * (For instance, when the DynamicInputFormat splits the input-file into
+   * chunks.)
+   * @param chunkId String to identify the chunk.
+   * @param configuration Configuration, describing the location of the listing-
+   * file, file-system for the map-job, etc.
+   * @return A DynamicInputChunk, corresponding to a chunk-file, with the name
+   * incorporating the chunk-id.
+   * @throws IOException Exception on failure to create the chunk.
+   */
+  public static DynamicInputChunk createChunkForWrite(String chunkId,
+                          Configuration configuration) throws IOException {
+    return new DynamicInputChunk(chunkId, configuration);
+  }
+
+  /**
+   * Method to write records into a chunk.
+   * @param key Key from the listing file.
+   * @param value Corresponding value from the listing file.
+   * @throws IOException Exception on failure to write to the file.
+   */
+  public void write(Text key, FileStatus value) throws IOException {
+    writer.append(key, value);
+  }
+
+  /**
+   * Closes streams opened to the chunk-file.
+   */
+  public void close() {
+    IOUtils.cleanup(LOG, reader, writer);
+  }
+
+  /**
+   * Reassigns the chunk to a specified Map-Task, for consumption.
+   * @param taskId The Map-Task to which the chunk is to be reassigned.
+   * @throws IOException Exception on failure to reassign.
+   */
+  public void assignTo(TaskID taskId) throws IOException {
+    Path newPath = new Path(chunkRootPath, taskId.toString());
+    if (!fs.rename(chunkFilePath, newPath)) {
+      LOG.warn(chunkFilePath + " could not be assigned to " + taskId);
+    }
+  }
+
+  private DynamicInputChunk(Path chunkFilePath,
+                            TaskAttemptContext taskAttemptContext)
+                                   throws IOException, InterruptedException {
+    if (!areInvariantsInitialized())
+      initializeChunkInvariants(taskAttemptContext.getConfiguration());
+
+    this.chunkFilePath = chunkFilePath;
+    openForRead(taskAttemptContext);
+  }
+
+  private void openForRead(TaskAttemptContext taskAttemptContext)
+          throws IOException, InterruptedException {
+    reader = new SequenceFileRecordReader<K, V>();
+    reader.initialize(new FileSplit(chunkFilePath, 0,
+            DistCpUtils.getFileSize(chunkFilePath, configuration), null),
+            taskAttemptContext);
+  }
+
+  /**
+   * Factory method that
+   * 1. acquires a chunk for the specified map-task attempt
+   * 2. returns a DynamicInputChunk associated with the acquired chunk-file.
+   * @param taskAttemptContext The attempt-context for the map task that's
+   * trying to acquire a chunk.
+   * @return The acquired dynamic-chunk. The chunk-file is renamed to the
+   * attempt-id (from the attempt-context.)
+   * @throws IOException Exception on failure.
+   * @throws InterruptedException Exception on failure.
+   */
+  public static DynamicInputChunk acquire(TaskAttemptContext taskAttemptContext)
+                                      throws IOException, InterruptedException {
+    if (!areInvariantsInitialized())
+        initializeChunkInvariants(taskAttemptContext.getConfiguration());
+
+    String taskId
+            = taskAttemptContext.getTaskAttemptID().getTaskID().toString();
+    Path acquiredFilePath = new Path(chunkRootPath, taskId);
+
+    if (fs.exists(acquiredFilePath)) {
+      LOG.info("Acquiring pre-assigned chunk: " + acquiredFilePath);
+      return new DynamicInputChunk(acquiredFilePath, taskAttemptContext);
+    }
+
+    for (FileStatus chunkFile : getListOfChunkFiles()) {
+      if (fs.rename(chunkFile.getPath(), acquiredFilePath)) {
+        LOG.info(taskId + " acquired " + chunkFile.getPath());
+        return new DynamicInputChunk(acquiredFilePath, taskAttemptContext);
+      }
+      else
+        LOG.warn(taskId + " could not acquire " + chunkFile.getPath());
+    }
+
+    return null;
+  }
+
+  /**
+   * Method to be called to relinquish an acquired chunk. All streams open to
+   * the chunk are closed, and the chunk-file is deleted.
+   * @throws IOException Exception thrown on failure to release (i.e. delete)
+   * the chunk file.
+   */
+  public void release() throws IOException {
+    close();
+    if (!fs.delete(chunkFilePath, false)) {
+      LOG.error("Unable to release chunk at path: " + chunkFilePath);
+      throw new IOException("Unable to release chunk at path: " + chunkFilePath);
+    }
+  }
+
+  static FileStatus [] getListOfChunkFiles() throws IOException {
+    Path chunkFilePattern = new Path(chunkRootPath, chunkFilePrefix + "*");
+    FileStatus chunkFiles[] = fs.globStatus(chunkFilePattern);
+    numChunksLeft = chunkFiles.length;
+    return chunkFiles;
+  }
+
+  /**
+   * Getter for the chunk-file's path, on HDFS.
+   * @return The qualified path to the chunk-file.
+   */
+  public Path getPath() {
+    return chunkFilePath;
+  }
+
+  /**
+   * Getter for the record-reader, opened to the chunk-file.
+   * @return Opened Sequence-file reader.
+   */
+  public SequenceFileRecordReader<K,V> getReader() {
+    assert reader != null : "Reader un-initialized!";
+    return reader;
+  }
+
+  /**
+   * Getter for the number of chunk-files left in the chunk-file directory.
+   * Useful to determine how many chunks (and hence, records) are left to be
+   * processed.
+   * @return Before the first scan of the directory, the number returned is -1.
+   * Otherwise, the number of chunk-files seen from the last scan is returned.
+   */
+  public static int getNumChunksLeft() {
+    return numChunksLeft;
+  }
+}
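
A sketch of the write-side life-cycle of a chunk, as it is driven from DynamicInputFormat: create, append listing records, close, and (optionally) pre-assign to a map task. The class below is illustrative only and not part of the patch; it assumes a Configuration that already carries the listing-file path, since the chunk directory is derived from it:

package org.apache.hadoop.tools.mapred.lib;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.TaskID;

class DynamicInputChunkWriteSketch {
  static void writeOneChunk(Configuration conf, Text relPath, FileStatus status,
                            TaskID firstMapTask) throws IOException {
    // 1. Create a chunk-file for writing, named after the chunk-id.
    DynamicInputChunk chunk = DynamicInputChunk.createChunkForWrite("00000", conf);
    try {
      // 2. Append copy-listing records into the chunk.
      chunk.write(relPath, status);
    } finally {
      // 3. Close the underlying SequenceFile writer.
      chunk.close();
    }
    // 4. Optionally pre-assign the chunk to a map task; otherwise a mapper
    //    will later claim it via DynamicInputChunk.acquire().
    chunk.assignTo(firstMapTask);
  }
}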

+ 292 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputFormat.java

@@ -0,0 +1,292 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred.lib;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileStatus;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.io.IOException;
+
+/**
+ * DynamicInputFormat implements the "Worker pattern" for DistCp.
+ * Rather than splitting up the copy-list into a set of static splits,
+ * the DynamicInputFormat does the following:
+ * 1. Splits the copy-list into small chunks on the DFS.
+ * 2. Creates a set of empty "dynamic" splits, that each consume as many chunks
+ *    as it can.
+ * This arrangement ensures that a single slow mapper won't slow down the entire
+ * job (since the slack will be picked up by other mappers, which consume more
+ * chunks.)
+ * By varying the split-ratio, one can vary chunk sizes to achieve different
+ * performance characteristics. 
+ */
+public class DynamicInputFormat<K, V> extends InputFormat<K, V> {
+  private static final Log LOG = LogFactory.getLog(DynamicInputFormat.class);
+
+  private static final String CONF_LABEL_LISTING_SPLIT_RATIO
+          = "mapred.listing.split.ratio";
+  private static final String CONF_LABEL_NUM_SPLITS
+          = "mapred.num.splits";
+  private static final String CONF_LABEL_NUM_ENTRIES_PER_CHUNK
+          = "mapred.num.entries.per.chunk";
+
+  /**
+   * Implementation of InputFormat::getSplits(). This method splits up the
+   * copy-listing file into chunks, and assigns the first batch to different
+   * tasks.
+   * @param jobContext JobContext for the map job.
+   * @return The list of (empty) dynamic input-splits.
+   * @throws IOException, on failure.
+   * @throws InterruptedException
+   */
+  @Override
+  public List<InputSplit> getSplits(JobContext jobContext)
+      throws IOException, InterruptedException {
+    LOG.info("DynamicInputFormat: Getting splits for job:"
+             + jobContext.getJobID());
+    return createSplits(jobContext,
+                        splitCopyListingIntoChunksWithShuffle(jobContext));
+  }
+
+  private List<InputSplit> createSplits(JobContext jobContext,
+                                        List<DynamicInputChunk> chunks)
+          throws IOException {
+    int numMaps = getNumMapTasks(jobContext.getConfiguration());
+
+    final int nSplits = Math.min(numMaps, chunks.size());
+    List<InputSplit> splits = new ArrayList<InputSplit>(nSplits);
+    
+    for (int i=0; i< nSplits; ++i) {
+      TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
+      chunks.get(i).assignTo(taskId);
+      splits.add(new FileSplit(chunks.get(i).getPath(), 0,
+          // Use a non-zero length for the FileSplit size, to guard against
+          // 0-sized file-splits ever being treated as "empty" and skipped
+          // over.
+          MIN_RECORDS_PER_CHUNK,
+          null));
+    }
+    DistCpUtils.publish(jobContext.getConfiguration(),
+                        CONF_LABEL_NUM_SPLITS, splits.size());
+    return splits;
+  }
+
+  private static int N_CHUNKS_OPEN_AT_ONCE_DEFAULT = 16;
+
+  private List<DynamicInputChunk> splitCopyListingIntoChunksWithShuffle
+                                    (JobContext context) throws IOException {
+
+    final Configuration configuration = context.getConfiguration();
+    int numRecords = getNumberOfRecords(configuration);
+    int numMaps = getNumMapTasks(configuration);
+    // Number of chunks each map will process, on average.
+    int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords);
+    validateNumChunksUsing(splitRatio, numMaps);
+
+    int numEntriesPerChunk = (int)Math.ceil((float)numRecords
+                                          /(splitRatio * numMaps));
+    DistCpUtils.publish(context.getConfiguration(),
+                        CONF_LABEL_NUM_ENTRIES_PER_CHUNK,
+                        numEntriesPerChunk);
+
+    final int nChunksTotal = (int)Math.ceil((float)numRecords/numEntriesPerChunk);
+    int nChunksOpenAtOnce
+            = Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal);
+
+    Path listingPath = getListingFilePath(configuration);
+    SequenceFile.Reader reader
+            = new SequenceFile.Reader(configuration,
+                                      SequenceFile.Reader.file(listingPath));
+
+    List<DynamicInputChunk> openChunks
+                  = new ArrayList<DynamicInputChunk>();
+    
+    List<DynamicInputChunk> chunksFinal = new ArrayList<DynamicInputChunk>();
+
+    FileStatus fileStatus = new FileStatus();
+    Text relPath = new Text();
+    int recordCounter = 0;
+    int chunkCount = 0;
+
+    try {
+
+      while (reader.next(relPath, fileStatus)) {
+        if (recordCounter % (nChunksOpenAtOnce*numEntriesPerChunk) == 0) {
+          // All chunks full. Create new chunk-set.
+          closeAll(openChunks);
+          chunksFinal.addAll(openChunks);
+
+          openChunks = createChunks(
+                  configuration, chunkCount, nChunksTotal, nChunksOpenAtOnce);
+
+          chunkCount += openChunks.size();
+
+          nChunksOpenAtOnce = openChunks.size();
+          recordCounter = 0;
+        }
+
+        // Shuffle into open chunks.
+        openChunks.get(recordCounter%nChunksOpenAtOnce).write(relPath, fileStatus);
+        ++recordCounter;
+      }
+
+    } finally {
+      closeAll(openChunks);
+      chunksFinal.addAll(openChunks);
+      IOUtils.closeStream(reader);
+    }
+
+    LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size()); 
+    return chunksFinal;
+  }
+
+  private static void validateNumChunksUsing(int splitRatio, int numMaps)
+                                              throws IOException {
+    if (splitRatio * numMaps > MAX_CHUNKS_TOLERABLE)
+      throw new IOException("Too many chunks created with splitRatio:"
+                 + splitRatio + ", numMaps:" + numMaps
+                 + ". Reduce numMaps or decrease split-ratio to proceed.");
+  }
+
+  private static void closeAll(List<DynamicInputChunk> chunks) {
+    for (DynamicInputChunk chunk: chunks)
+      chunk.close();
+  }
+
+  private static List<DynamicInputChunk> createChunks(Configuration config,
+                      int chunkCount, int nChunksTotal, int nChunksOpenAtOnce)
+                                          throws IOException {
+    List<DynamicInputChunk> chunks = new ArrayList<DynamicInputChunk>();
+    int chunkIdUpperBound
+            = Math.min(nChunksTotal, chunkCount + nChunksOpenAtOnce);
+
+    // If there will be fewer than nChunksOpenAtOnce chunks left after
+    // the current batch of chunks, fold the remaining chunks into
+    // the current batch.
+    if (nChunksTotal - chunkIdUpperBound < nChunksOpenAtOnce)
+      chunkIdUpperBound = nChunksTotal;
+
+    for (int i=chunkCount; i < chunkIdUpperBound; ++i)
+      chunks.add(createChunk(i, config));
+    return chunks;
+  }
+
+  private static DynamicInputChunk createChunk(int chunkId, Configuration config)
+                                              throws IOException {
+    return DynamicInputChunk.createChunkForWrite(String.format("%05d", chunkId),
+                                              config);
+  }
+
+
+  private static Path getListingFilePath(Configuration configuration) {
+    String listingFilePathString = configuration.get(
+            DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
+
+    assert !listingFilePathString.equals("") : "Listing file not found.";
+
+    Path listingFilePath = new Path(listingFilePathString);
+    try {
+      assert listingFilePath.getFileSystem(configuration)
+              .exists(listingFilePath) : "Listing file: " + listingFilePath +
+                                          " not found.";
+    } catch (IOException e) {
+      assert false :   "Listing file: " + listingFilePath
+                    + " couldn't be accessed. " + e.getMessage();
+    }
+    return listingFilePath;
+  }
+
+  private static int getNumberOfRecords(Configuration configuration) {
+    return DistCpUtils.getInt(configuration,
+                              DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS);
+  }
+
+  private static int getNumMapTasks(Configuration configuration) {
+    return DistCpUtils.getInt(configuration,
+                              JobContext.NUM_MAPS);
+  }
+
+  private static int getListingSplitRatio(Configuration configuration,
+                                            int numMaps, int numPaths) {
+    return configuration.getInt(
+            CONF_LABEL_LISTING_SPLIT_RATIO,
+            getSplitRatio(numMaps, numPaths));
+  }
+
+  private static final int MAX_CHUNKS_TOLERABLE = 400;
+  private static final int MAX_CHUNKS_IDEAL     = 100;
+  private static final int MIN_RECORDS_PER_CHUNK = 5;
+  private static final int SPLIT_RATIO_DEFAULT  = 2;
+
+  /**
+   * Package private, for testability.
+   * @param nMaps The number of maps requested.
+   * @param nRecords The number of records to be copied.
+   * @return The number of splits each map should handle, ideally.
+   */
+  static int getSplitRatio(int nMaps, int nRecords) {
+    if (nMaps == 1) {
+      LOG.warn("nMaps == 1. Why use DynamicInputFormat?");
+      return 1;
+    }
+
+    if (nMaps > MAX_CHUNKS_IDEAL)
+      return SPLIT_RATIO_DEFAULT;
+
+    int nPickups = (int)Math.ceil((float)MAX_CHUNKS_IDEAL/nMaps);
+    int nRecordsPerChunk = (int)Math.ceil((float)nRecords/(nMaps*nPickups));
+
+    return nRecordsPerChunk < MIN_RECORDS_PER_CHUNK ?
+              SPLIT_RATIO_DEFAULT : nPickups;
+  }
+
+  static int getNumEntriesPerChunk(Configuration configuration) {
+    return DistCpUtils.getInt(configuration,
+                              CONF_LABEL_NUM_ENTRIES_PER_CHUNK);
+  }
+
+
+  /**
+   * Implementation of InputFormat::createRecordReader().
+   * @param inputSplit The split for which the RecordReader is required.
+   * @param taskAttemptContext TaskAttemptContext for the current attempt.
+   * @return DynamicRecordReader instance.
+   * @throws IOException, on failure.
+   * @throws InterruptedException
+   */
+  @Override
+  public RecordReader<K, V> createRecordReader(
+          InputSplit inputSplit,
+          TaskAttemptContext taskAttemptContext)
+          throws IOException, InterruptedException {
+    return new DynamicRecordReader<K, V>();
+  }
+}
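
To make the chunking arithmetic above concrete, the following illustrative sketch (not part of the patch) reproduces the calculations of splitCopyListingIntoChunksWithShuffle() and getSplitRatio() for a hypothetical listing of 10,000 records copied with 40 maps:

    // Illustrative only: mirrors DynamicInputFormat's chunk arithmetic for
    // hypothetical inputs (10,000 listing records, 40 requested maps).
    public class ChunkMathSketch {
      public static void main(String[] args) {
        int numRecords = 10000, numMaps = 40;

        // getSplitRatio(): 40 <= MAX_CHUNKS_IDEAL (100), so
        // nPickups = ceil(100/40) = 3 and nRecordsPerChunk = ceil(10000/120) = 84,
        // which is >= MIN_RECORDS_PER_CHUNK (5); the ratio is therefore 3.
        int splitRatio = 3;

        int numEntriesPerChunk =
            (int) Math.ceil((float) numRecords / (splitRatio * numMaps)); // 84
        int nChunksTotal =
            (int) Math.ceil((float) numRecords / numEntriesPerChunk);     // 120

        // 120 chunks across 40 maps: roughly 3 pickups per map, well under
        // MAX_CHUNKS_TOLERABLE (400), so validateNumChunksUsing() passes.
        System.out.println(numEntriesPerChunk + " records/chunk, "
            + nChunksTotal + " chunks");
      }
    }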

+ 203 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicRecordReader.java

@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred.lib;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.conf.Configuration;
+
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * The DynamicRecordReader is used in conjunction with the DynamicInputFormat
+ * to implement the "Worker pattern" for DistCp.
+ * The DynamicRecordReader is responsible for:
+ * 1. Presenting the contents of each chunk to DistCp's mapper.
+ * 2. Acquiring a new chunk when the current chunk has been completely consumed,
+ *    transparently.
+ */
+public class DynamicRecordReader<K, V> extends RecordReader<K, V> {
+  private static final Log LOG = LogFactory.getLog(DynamicRecordReader.class);
+  private TaskAttemptContext taskAttemptContext;
+  private Configuration configuration;
+  private DynamicInputChunk<K, V> chunk;
+  private TaskID taskId;
+
+  // Data required for progress indication.
+  private int numRecordsPerChunk; // Constant per job.
+  private int totalNumRecords;    // Constant per job.
+  private int numRecordsProcessedByThisMap = 0;
+  private long timeOfLastChunkDirScan = 0;
+  private boolean isChunkDirAlreadyScanned = false;
+
+  private static final long TIME_THRESHOLD_FOR_DIR_SCANS = TimeUnit.MINUTES.toMillis(5);
+
+  /**
+   * Implementation for RecordReader::initialize(). Initializes the internal
+   * RecordReader to read from chunks.
+   * @param inputSplit The InputSplit for the map. Ignored entirely.
+   * @param taskAttemptContext The AttemptContext.
+   * @throws IOException, on failure.
+   * @throws InterruptedException
+   */
+  @Override
+  public void initialize(InputSplit inputSplit,
+                         TaskAttemptContext taskAttemptContext)
+                         throws IOException, InterruptedException {
+    numRecordsPerChunk = DynamicInputFormat.getNumEntriesPerChunk(
+            taskAttemptContext.getConfiguration());
+    this.taskAttemptContext = taskAttemptContext;
+    configuration = taskAttemptContext.getConfiguration();
+    taskId = taskAttemptContext.getTaskAttemptID().getTaskID();
+    chunk = DynamicInputChunk.acquire(this.taskAttemptContext);
+    timeOfLastChunkDirScan = System.currentTimeMillis();
+    isChunkDirAlreadyScanned = false;
+
+    totalNumRecords = getTotalNumRecords();
+
+  }
+
+  private int getTotalNumRecords() {
+    return DistCpUtils.getInt(configuration,
+                              DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS);
+  }
+
+  /**
+   * Implementation of RecordReader::nextKeyValue().
+   * Reads the contents of the current chunk and returns them. When a chunk has
+   * been completely exhausted, a new chunk is acquired and read,
+   * transparently.
+   * @return True, if another record could be read. False, otherwise.
+   * @throws IOException, on failure.
+   * @throws InterruptedException
+   */
+  @Override
+  public boolean nextKeyValue()
+      throws IOException, InterruptedException {
+
+    if (chunk == null) {
+      if (LOG.isDebugEnabled())
+        LOG.debug(taskId + ": RecordReader is null. No records to be read.");
+      return false;
+    }
+
+    if (chunk.getReader().nextKeyValue()) {
+      ++numRecordsProcessedByThisMap;
+      return true;
+    }
+
+    if (LOG.isDebugEnabled())
+      LOG.debug(taskId + ": Current chunk exhausted. " +
+                         "Attempting to pick up a new one.");
+
+    chunk.release();
+    timeOfLastChunkDirScan = System.currentTimeMillis();
+    isChunkDirAlreadyScanned = false;
+    
+    chunk = DynamicInputChunk.acquire(taskAttemptContext);
+
+    if (chunk == null) return false;
+
+    if (chunk.getReader().nextKeyValue()) {
+      ++numRecordsProcessedByThisMap;
+      return true;
+    }
+    else {
+      return false;
+    }
+  }
+
+  /**
+   * Implementation of RecordReader::getCurrentKey().
+   * @return The key of the current record (i.e. the relative path of the source file).
+   * @throws IOException, on failure.
+   * @throws InterruptedException
+   */
+  @Override
+  public K getCurrentKey()
+      throws IOException, InterruptedException {
+    return chunk.getReader().getCurrentKey();
+  }
+
+  /**
+   * Implementation of RecordReader::getCurrentValue().
+   * @return The value of the current record (i.e. the FileStatus of the source file).
+   * @throws IOException, on failure.
+   * @throws InterruptedException
+   */
+  @Override
+  public V getCurrentValue()
+      throws IOException, InterruptedException {
+    return chunk.getReader().getCurrentValue();
+  }
+
+  /**
+   * Implementation of RecordReader::getProgress().
+   * @return A fraction [0.0,1.0] indicating the progress of a DistCp mapper.
+   * @throws IOException, on failure.
+   * @throws InterruptedException
+   */
+  @Override
+  public float getProgress()
+      throws IOException, InterruptedException {
+    final int numChunksLeft = getNumChunksLeft();
+    if (numChunksLeft < 0) {// Un-initialized. i.e. Before 1st dir-scan.
+      assert numRecordsProcessedByThisMap <= numRecordsPerChunk
+              : "numRecordsProcessedByThisMap:" + numRecordsProcessedByThisMap +
+                " exceeds numRecordsPerChunk:" + numRecordsPerChunk;
+      return ((float) numRecordsProcessedByThisMap) / totalNumRecords;
+      // Conservative estimate, till the first directory scan.
+    }
+
+    return ((float) numRecordsProcessedByThisMap)
+            /(numRecordsProcessedByThisMap + numRecordsPerChunk*numChunksLeft);
+  }
+
+  private int getNumChunksLeft() throws IOException {
+    long now = System.currentTimeMillis();
+    boolean tooLongSinceLastDirScan
+                  = now - timeOfLastChunkDirScan > TIME_THRESHOLD_FOR_DIR_SCANS;
+
+    if (tooLongSinceLastDirScan
+            || (!isChunkDirAlreadyScanned &&
+                    numRecordsProcessedByThisMap%numRecordsPerChunk
+                              > numRecordsPerChunk/2)) {
+      DynamicInputChunk.getListOfChunkFiles();
+      isChunkDirAlreadyScanned = true;
+      timeOfLastChunkDirScan = now;
+    }
+
+    return DynamicInputChunk.getNumChunksLeft();
+  }
+  /**
+   * Implementation of RecordReader::close().
+   * Closes the RecordReader.
+   * @throws IOException, on failure.
+   */
+  @Override
+  public void close()
+      throws IOException {
+    if (chunk != null)
+        chunk.close();
+  }
+}
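
The reader above is driven by the standard Hadoop RecordReader contract; the chunk hand-off is invisible to the caller. A rough sketch of that consumption loop follows (normally the MapReduce framework performs it; the inputSplit and taskAttemptContext objects are assumed to be supplied by the framework):

    // Illustrative only: the framework-style read loop over a DynamicRecordReader.
    RecordReader<Text, FileStatus> reader = new DynamicRecordReader<Text, FileStatus>();
    reader.initialize(inputSplit, taskAttemptContext);  // acquires the first chunk
    while (reader.nextKeyValue()) {                     // moves across chunks transparently
      Text relativePath = reader.getCurrentKey();       // relative path from the listing
      FileStatus sourceStatus = reader.getCurrentValue();
      // ... the CopyMapper would process (relativePath, sourceStatus) here ...
    }
    reader.close();                                     // closes the last held chunk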

+ 343 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java

@@ -0,0 +1,343 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.util;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileChecksum;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
+import org.apache.hadoop.tools.mapred.UniformSizeInputFormat;
+import org.apache.hadoop.tools.DistCpOptions;
+import org.apache.hadoop.mapreduce.InputFormat;
+
+import java.io.IOException;
+import java.util.EnumSet;
+import java.util.Locale;
+import java.text.DecimalFormat;
+import java.net.URI;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+/**
+ * Utility functions used in DistCp.
+ */
+public class DistCpUtils {
+
+  private static final Log LOG = LogFactory.getLog(DistCpUtils.class);
+
+  /**
+   * Retrieves size of the file at the specified path.
+   * @param path The path of the file whose size is sought.
+   * @param configuration Configuration, to retrieve the appropriate FileSystem.
+   * @return The file-size, in number of bytes.
+   * @throws IOException, on failure.
+   */
+  public static long getFileSize(Path path, Configuration configuration)
+                                            throws IOException {
+    if (LOG.isDebugEnabled())
+      LOG.debug("Retrieving file size for: " + path);
+    return path.getFileSystem(configuration).getFileStatus(path).getLen();
+  }
+
+  /**
+   * Utility to publish a value to a configuration.
+   * @param configuration The Configuration to which the value must be written.
+   * @param label The label for the value being published.
+   * @param value The value being published.
+   * @param <T> The type of the value.
+   */
+  public static <T> void publish(Configuration configuration,
+                                 String label, T value) {
+    configuration.set(label, String.valueOf(value));
+  }
+
+  /**
+   * Utility to retrieve a specified key from a Configuration. Fails an
+   * assertion if the key is not found (i.e. the value is negative).
+   * @param configuration The Configuration in which the key is sought.
+   * @param label The key being sought.
+   * @return Integer value of the key.
+   */
+  public static int getInt(Configuration configuration, String label) {
+    int value = configuration.getInt(label, -1);
+    assert value >= 0 : "Couldn't find " + label;
+    return value;
+  }
+
+  /**
+   * Utility to retrieve a specified key from a Configuration. Fails an
+   * assertion if the key is not found (i.e. the value is negative).
+   * @param configuration The Configuration in which the key is sought.
+   * @param label The key being sought.
+   * @return Long value of the key.
+   */
+  public static long getLong(Configuration configuration, String label) {
+    long value = configuration.getLong(label, -1);
+    assert value >= 0 : "Couldn't find " + label;
+    return value;
+  }
+
+  /**
+   * Returns the class that implements a copy strategy. Looks up the implementation for
+   * a particular strategy from distcp-default.xml
+   *
+   * @param conf - Configuration object
+   * @param options - Handle to input options
+   * @return Class implementing the strategy specified in options.
+   */
+  public static Class<? extends InputFormat> getStrategy(Configuration conf,
+                                                                 DistCpOptions options) {
+    String confLabel = "distcp." +
+        options.getCopyStrategy().toLowerCase(Locale.getDefault()) + ".strategy.impl";
+    return conf.getClass(confLabel, UniformSizeInputFormat.class, InputFormat.class);
+  }
+
+  /**
+   * Gets the relative path of a child path with respect to a root path.
+   * For example, if childPath = /tmp/abc/xyz/file and
+   *                 sourceRootPath = /tmp/abc,
+   * the relative path would be /xyz/file.
+   *              If childPath = /file and
+   *                 sourceRootPath = /,
+   * the relative path would be /file.
+   * @param sourceRootPath - Source root path
+   * @param childPath - Path for which the relative path is required
+   * @return - Relative portion of the child path (always prefixed with /,
+   *           unless it is empty)
+   */
+  public static String getRelativePath(Path sourceRootPath, Path childPath) {
+    String childPathString = childPath.toUri().getPath();
+    String sourceRootPathString = sourceRootPath.toUri().getPath();
+    return sourceRootPathString.equals("/") ? childPathString :
+        childPathString.substring(sourceRootPathString.length());
+  }
+
+  /**
+   * Pack file preservation attributes into a string, containing
+   * just the first character of each preservation attribute
+   * @param attributes - Attribute set to preserve
+   * @return - String containing first letters of each attribute to preserve
+   */
+  public static String packAttributes(EnumSet<FileAttribute> attributes) {
+    StringBuilder buffer = new StringBuilder(5);
+    for (FileAttribute attribute : attributes) {
+      buffer.append(attribute.name().charAt(0));
+    }
+    return buffer.toString();
+  }
+
+  /**
+   * Unpacks a preservation-attribute string (containing the first character of
+   * each preservation attribute) back into a set of attributes to preserve.
+   * @param attributes - Attribute string
+   * @return - Attribute set
+   */
+  public static EnumSet<FileAttribute> unpackAttributes(String attributes) {
+    EnumSet<FileAttribute> retValue = EnumSet.noneOf(FileAttribute.class);
+
+    if (attributes != null) {
+      for (int index = 0; index < attributes.length(); index++) {
+        retValue.add(FileAttribute.getAttribute(attributes.charAt(index)));
+      }
+    }
+
+    return retValue;
+  }
+
+  /**
+   * Preserves the requested attributes on the file at the given path, to match
+   * those of the source file status passed as argument. Barring the block size,
+   * all the other attributes can be preserved by this function.
+   *
+   * @param targetFS - File system
+   * @param path - Path that needs to preserve original file status
+   * @param srcFileStatus - Original file status
+   * @param attributes - Attribute set that need to be preserved
+   * @throws IOException - Exception if any (particularly relating to group/owner
+   *                       change or any transient error)
+   */
+  public static void preserve(FileSystem targetFS, Path path,
+                              FileStatus srcFileStatus,
+                              EnumSet<FileAttribute> attributes) throws IOException {
+
+    FileStatus targetFileStatus = targetFS.getFileStatus(path);
+    String group = targetFileStatus.getGroup();
+    String user = targetFileStatus.getOwner();
+    boolean chown = false;
+
+    if (attributes.contains(FileAttribute.PERMISSION) &&
+      !srcFileStatus.getPermission().equals(targetFileStatus.getPermission())) {
+      targetFS.setPermission(path, srcFileStatus.getPermission());
+    }
+
+    if (attributes.contains(FileAttribute.REPLICATION) && ! targetFileStatus.isDirectory() &&
+        srcFileStatus.getReplication() != targetFileStatus.getReplication()) {
+      targetFS.setReplication(path, srcFileStatus.getReplication());
+    }
+
+    if (attributes.contains(FileAttribute.GROUP) &&
+            !group.equals(srcFileStatus.getGroup())) {
+      group = srcFileStatus.getGroup();
+      chown = true;
+    }
+
+    if (attributes.contains(FileAttribute.USER) &&
+            !user.equals(srcFileStatus.getOwner())) {
+      user = srcFileStatus.getOwner();
+      chown = true;
+    }
+
+    if (chown) {
+      targetFS.setOwner(path, user, group);
+    }
+  }
+
+  /**
+   * Sorts a sequence file containing Text and FileStatus as key and value respectively.
+   *
+   * @param fs - File System
+   * @param conf - Configuration
+   * @param sourceListing - Source listing file
+   * @return Path of the sorted file (the source listing, with "_sorted" appended to the name)
+   * @throws IOException - Any exception during sort.
+   */
+  public static Path sortListing(FileSystem fs, Configuration conf, Path sourceListing)
+      throws IOException {
+    SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, Text.class, FileStatus.class, conf);
+    Path output = new Path(sourceListing.toString() +  "_sorted");
+
+    if (fs.exists(output)) {
+      fs.delete(output, false);
+    }
+
+    sorter.sort(sourceListing, output);
+    return output;
+  }
+
+  /**
+   * String utility to convert a number-of-bytes to human readable format.
+   */
+  private static ThreadLocal<DecimalFormat> FORMATTER
+                        = new ThreadLocal<DecimalFormat>() {
+    @Override
+    protected DecimalFormat initialValue() {
+      return new DecimalFormat("0.0");
+    }
+  };
+
+  public static DecimalFormat getFormatter() {
+    return FORMATTER.get();
+  }
+
+  public static String getStringDescriptionFor(long nBytes) {
+
+    char units [] = {'B', 'K', 'M', 'G', 'T', 'P'};
+
+    double current = nBytes;
+    double prev    = current;
+    int index = 0;
+
+    while ((current = current/1024) >= 1) {
+      prev = current;
+      ++index;
+    }
+
+    assert index < units.length : "Too large a number.";
+
+    return getFormatter().format(prev) + units[index];
+  }
+
+  /**
+   * Utility to compare checksums for the paths specified.
+   *
+   * If a checksum can't be retrieved, the comparison does not fail.
+   * The only time the comparison fails is when checksums are available
+   * for both paths, and they don't match.
+   *
+   * @param sourceFS FileSystem for the source path.
+   * @param source The source path.
+   * @param targetFS FileSystem for the target path.
+   * @param target The target path.
+   * @return If either checksum couldn't be retrieved, the function returns
+   * false. If checksums are retrieved, the function returns true if they match,
+   * and false otherwise.
+   * @throws IOException if there's an exception while retrieving checksums.
+   */
+  public static boolean checksumsAreEqual(FileSystem sourceFS, Path source,
+                                   FileSystem targetFS, Path target)
+                                   throws IOException {
+    FileChecksum sourceChecksum = null;
+    FileChecksum targetChecksum = null;
+    try {
+      sourceChecksum = sourceFS.getFileChecksum(source);
+      targetChecksum = targetFS.getFileChecksum(target);
+    } catch (IOException e) {
+      LOG.error("Unable to retrieve checksum for " + source + " or " + target, e);
+    }
+    return (sourceChecksum == null || targetChecksum == null ||
+            sourceChecksum.equals(targetChecksum));
+  }
+
+  /**
+   * Checks whether two file systems refer to the same location (i.e. share the
+   * same scheme, canonical host and port).
+   * @param srcFs The source FileSystem.
+   * @param destFs The destination FileSystem.
+   * @return True if both point to the same file-system; false otherwise.
+   */
+  public static boolean compareFs(FileSystem srcFs, FileSystem destFs) {
+    URI srcUri = srcFs.getUri();
+    URI dstUri = destFs.getUri();
+    if (srcUri.getScheme() == null) {
+      return false;
+    }
+    if (!srcUri.getScheme().equals(dstUri.getScheme())) {
+      return false;
+    }
+    String srcHost = srcUri.getHost();
+    String dstHost = dstUri.getHost();
+    if ((srcHost != null) && (dstHost != null)) {
+      try {
+        srcHost = InetAddress.getByName(srcHost).getCanonicalHostName();
+        dstHost = InetAddress.getByName(dstHost).getCanonicalHostName();
+      } catch(UnknownHostException ue) {
+        if (LOG.isDebugEnabled())
+          LOG.debug("Could not compare file-systems. Unknown host: ", ue);
+        return false;
+      }
+      if (!srcHost.equals(dstHost)) {
+        return false;
+      }
+    }
+    else if (srcHost == null && dstHost != null) {
+      return false;
+    }
+    else if (srcHost != null) {
+      return false;
+    }
+
+    //check for ports
+
+    return srcUri.getPort() == dstUri.getPort();
+  }
+}
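
A brief usage sketch of two of the utilities above, assuming the imports already used by DistCpUtils; the paths are hypothetical, and the packed string shown in the comment assumes the enum's declaration order:

    // Illustrative only: relative-path and attribute-packing round trips.
    Path sourceRoot = new Path("/tmp/abc");
    Path child      = new Path("/tmp/abc/xyz/file");
    String relative = DistCpUtils.getRelativePath(sourceRoot, child);   // "/xyz/file"

    EnumSet<FileAttribute> toPreserve =
        EnumSet.of(FileAttribute.USER, FileAttribute.GROUP, FileAttribute.PERMISSION);
    String packed = DistCpUtils.packAttributes(toPreserve);             // e.g. "UGP"
    EnumSet<FileAttribute> unpacked = DistCpUtils.unpackAttributes(packed);
    assert unpacked.equals(toPreserve);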

+ 106 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java

@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.hadoop.tools.util;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.io.retry.RetryPolicies;
+
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * This class represents commands that can be retried on failure, in a
+ * configurable manner.
+ */
+public abstract class RetriableCommand {
+
+  private static final Log LOG = LogFactory.getLog(RetriableCommand.class);
+
+  private static final long DELAY_MILLISECONDS = 500;
+  private static final int  MAX_RETRIES        = 3;
+
+  private RetryPolicy retryPolicy = RetryPolicies.
+      exponentialBackoffRetry(MAX_RETRIES, DELAY_MILLISECONDS, TimeUnit.MILLISECONDS);
+  protected String description;
+
+  /**
+   * Constructor.
+   * @param description The human-readable description of the command.
+   */
+  public RetriableCommand(String description) {
+    this.description = description;
+  }
+
+  /**
+   * Constructor.
+   * @param description The human-readable description of the command.
+   * @param retryPolicy The RetryPolicy to be used to compute retries.
+   */
+  public RetriableCommand(String description, RetryPolicy retryPolicy) {
+    this(description);
+    setRetryPolicy(retryPolicy);
+  }
+
+  /**
+   * Implement this method to define the command-logic that will be
+   * retried on failure (i.e. on Exception).
+   * @param arguments Argument-list to the command.
+   * @return Generic "Object".
+   * @throws Exception Throws Exception on complete failure.
+   */
+  protected abstract Object doExecute(Object... arguments) throws Exception;
+
+  /**
+   * The execute() method invokes doExecute() until either:
+   *  1. doExecute() succeeds, or
+   *  2. the command may no longer be retried (e.g. runs out of retry-attempts).
+   * @param arguments The list of arguments for the command.
+   * @return Generic "Object" from doExecute(), on success.
+   * @throws Exception On complete failure (i.e. once retries are exhausted),
+   *                   an IOException wrapping the latest failure.
+   */
+  public Object execute(Object... arguments) throws Exception {
+    Exception latestException;
+    int counter = 0;
+    do {
+      try {
+        return doExecute(arguments);
+      } catch(Exception exception) {
+        LOG.error("Failure in Retriable command: " + description, exception);
+        latestException = exception;
+      }
+      counter++;
+    } while (retryPolicy.shouldRetry(latestException, counter, 0, true).equals(RetryPolicy.RetryAction.RETRY));
+
+    throw new IOException("Couldn't run retriable-command: " + description,
+                          latestException);
+  }
+
+  /**
+   * Fluent-interface to change the RetryPolicy.
+   * @param retryHandler The new RetryPolicy instance to be used.
+   * @return Self.
+   */
+  public RetriableCommand setRetryPolicy(RetryPolicy retryHandler) {
+    this.retryPolicy = retryHandler;
+    return this;
+  }
+}
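
A minimal sketch of a RetriableCommand subclass; the class name and the delete operation are hypothetical, and the default policy above (3 attempts with exponential back-off) applies unless setRetryPolicy() is called:

    // Illustrative only: a command whose doExecute() is retried on failure.
    public class RetriableDelete extends RetriableCommand {
      public RetriableDelete() {
        super("Delete path");
      }

      @Override
      protected Object doExecute(Object... arguments) throws Exception {
        FileSystem fs = (FileSystem) arguments[0];
        Path path    = (Path) arguments[1];
        return fs.delete(path, true);   // re-attempted if this throws
      }
    }

    // Usage: new RetriableDelete().execute(fileSystem, somePath);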

+ 139 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/ThrottledInputStream.java

@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * The ThrottledInputStream provides bandwidth throttling on a specified
+ * InputStream. It is implemented as a wrapper on top of another InputStream
+ * instance.
+ * The throttling works by examining the number of bytes read from the underlying
+ * InputStream since the beginning, and sleep()ing for a time interval if
+ * the byte-transfer is found to exceed the specified tolerable maximum.
+ * (Thus, while the read-rate might exceed the maximum for a given short interval,
+ * the average tends towards the specified maximum, overall.)
+ */
+public class ThrottledInputStream extends InputStream {
+
+  private final InputStream rawStream;
+  private final long maxBytesPerSec;
+  private final long startTime = System.currentTimeMillis();
+
+  private long bytesRead = 0;
+  private long totalSleepTime = 0;
+
+  private static final long SLEEP_DURATION_MS = 50;
+
+  public ThrottledInputStream(InputStream rawStream) {
+    this(rawStream, Long.MAX_VALUE);
+  }
+
+  public ThrottledInputStream(InputStream rawStream, long maxBytesPerSec) {
+    assert maxBytesPerSec > 0 : "Bandwidth " + maxBytesPerSec + " is invalid"; 
+    this.rawStream = rawStream;
+    this.maxBytesPerSec = maxBytesPerSec;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public int read() throws IOException {
+    throttle();
+    int data = rawStream.read();
+    if (data != -1) {
+      bytesRead++;
+    }
+    return data;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public int read(byte[] b) throws IOException {
+    throttle();
+    int readLen = rawStream.read(b);
+    if (readLen != -1) {
+      bytesRead += readLen;
+    }
+    return readLen;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public int read(byte[] b, int off, int len) throws IOException {
+    throttle();
+    int readLen = rawStream.read(b, off, len);
+    if (readLen != -1) {
+      bytesRead += readLen;
+    }
+    return readLen;
+  }
+
+  private void throttle() throws IOException {
+    if (getBytesPerSec() > maxBytesPerSec) {
+      try {
+        Thread.sleep(SLEEP_DURATION_MS);
+        totalSleepTime += SLEEP_DURATION_MS;
+      } catch (InterruptedException e) {
+        throw new IOException("Thread aborted", e);
+      }
+    }
+  }
+
+  /**
+   * Getter for the number of bytes read from this stream, since creation.
+   * @return The number of bytes.
+   */
+  public long getTotalBytesRead() {
+    return bytesRead;
+  }
+
+  /**
+   * Getter for the read-rate from this stream, since creation.
+   * Calculated as bytesRead/elapsedTimeSinceStart.
+   * @return Read rate, in bytes/sec.
+   */
+  public long getBytesPerSec() {
+    long elapsed = (System.currentTimeMillis() - startTime) / 1000;
+    if (elapsed == 0) {
+      return bytesRead;
+    } else {
+      return bytesRead / elapsed;
+    }
+  }
+
+  /**
+   * Getter for the total time spent in sleep.
+   * @return Number of milliseconds spent in sleep.
+   */
+  public long getTotalSleepTime() {
+    return totalSleepTime;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public String toString() {
+    return "ThrottledInputStream{" +
+        "bytesRead=" + bytesRead +
+        ", maxBytesPerSec=" + maxBytesPerSec +
+        ", bytesPerSec=" + getBytesPerSec() +
+        ", totalSleepTime=" + totalSleepTime +
+        '}';
+  }
+}
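
A usage sketch for the stream above, capping a copy's read-rate at roughly 1 MB/s; the FileSystem handle, source path, and destination stream ('out') are hypothetical:

    // Illustrative only: throttled copy loop.
    InputStream in = new ThrottledInputStream(
        fs.open(new Path("/source/file")), 1024 * 1024 /* maxBytesPerSec */);
    try {
      byte[] buffer = new byte[8192];
      int bytesRead;
      while ((bytesRead = in.read(buffer)) != -1) {
        out.write(buffer, 0, bytesRead);   // 'out' is any destination OutputStream
      }
    } finally {
      in.close();
    }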

+ 41 - 0
hadoop-tools/hadoop-distcp/src/main/resources/distcp-default.xml

@@ -0,0 +1,41 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Do not modify this file directly. Anything that needs to be overridden
+     should be done so through -D switches or a customized configuration. -->
+
+<configuration>
+
+    <property>
+        <name>distcp.dynamic.strategy.impl</name>
+        <value>org.apache.hadoop.tools.mapred.lib.DynamicInputFormat</value>
+        <description>Implementation of dynamic input format</description>
+    </property>
+
+    <property>
+        <name>distcp.static.strategy.impl</name>
+        <value>org.apache.hadoop.tools.mapred.UniformSizeInputFormat</value>
+        <description>Implementation of static input format</description>
+    </property>
+
+    <property>
+        <name>mapred.job.map.memory.mb</name>
+        <value>1024</value>
+    </property>
+
+    <property>
+        <name>mapred.job.reduce.memory.mb</name>
+        <value>1024</value>
+    </property>
+
+    <property>
+        <name>mapred.reducer.new-api</name>
+        <value>true</value>
+    </property>
+
+    <property>
+        <name>mapreduce.reduce.class</name>
+        <value>org.apache.hadoop.mapreduce.Reducer</value>
+    </property>
+
+</configuration>
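
The two strategy keys above are what DistCpUtils.getStrategy() (shown earlier) resolves against: the -strategy value is lower-cased and embedded into the key name, with UniformSizeInputFormat as the fallback. A rough sketch of that lookup for the "dynamic" strategy, assuming a Configuration instance 'conf' is in scope:

    // Illustrative only: how a strategy name is mapped to an InputFormat class.
    String strategy  = "dynamic";                       // e.g. from "-strategy dynamic"
    String confLabel = "distcp."
        + strategy.toLowerCase(Locale.getDefault())
        + ".strategy.impl";                             // "distcp.dynamic.strategy.impl"
    Class<? extends InputFormat> inputFormatClass =
        conf.getClass(confLabel, UniformSizeInputFormat.class, InputFormat.class);
    // With distcp-default.xml on the classpath, this resolves to DynamicInputFormat.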

+ 98 - 0
hadoop-tools/hadoop-distcp/src/site/fml/faq.fml

@@ -0,0 +1,98 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<faqs xmlns="http://maven.apache.org/FML/1.0.1"
+      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+      xsi:schemaLocation="http://maven.apache.org/FML/1.0.1 http://maven.apache.org/xsd/fml-1.0.1.xsd"
+      title="Frequently Asked Questions">
+  <part id="General">
+    <title>General</title>
+
+    <faq id="Update">
+      <question>Why does -update not create the parent source-directory under
+      a pre-existing target directory?</question>
+      <answer>The behaviour of <code>-update</code> and <code>-overwrite</code>
+      is described in detail in the Usage section of this document. In short,
+      if either option is used with a pre-existing destination directory, the
+      <strong>contents</strong> of each source directory are copied over, rather
+      than the source-directory itself.
+      This behaviour is consistent with the legacy DistCp implementation as well.
+      </answer>
+    </faq>
+
+    <faq id="Deviation">
+      <question>How does the new DistCp differ in semantics from the Legacy
+      DistCp?</question>
+      <answer>
+          <ul>
+              <li>Files that are skipped during copy used to also have their
+              file-attributes (permissions, owner/group info, etc.) unchanged,
+              when copied with Legacy DistCp. These are now updated, even if
+              the file-copy is skipped.</li>
+              <li>Empty root directories among the source-path inputs were not
+              created at the target, in Legacy DistCp. These are now created.</li>
+          </ul>
+      </answer>
+    </faq>
+
+    <faq id="nMaps">
+      <question>Why does the new DistCp use more maps than legacy DistCp?</question>
+      <answer>
+          <p>Legacy DistCp works by figuring out what files need to be actually
+      copied to target <strong>before</strong> the copy-job is launched, and then
+      launching as many maps as required for copy. So if a majority of the files
+      need to be skipped (because they already exist, for example), fewer maps
+      will be needed. As a consequence, the time spent in setup (i.e. before the
+      M/R job) is higher.</p>
+          <p>The new DistCp calculates only the contents of the source-paths. It
+      doesn't try to filter out what files can be skipped. That decision is
+      put off till the M/R job runs. This is much faster (vis-a-vis execution-time),
+      but the number of maps launched will be as specified in the <code>-m</code>
+      option, or 20 (default) if unspecified.</p>
+      </answer>
+    </faq>
+
+    <faq id="more_maps">
+      <question>Why does DistCp not run faster when more maps are specified?</question>
+      <answer>
+          <p>At present, the smallest unit of work for DistCp is a file. i.e.,
+          a file is processed by only one map. Increasing the number of maps to
+          a value exceeding the number of files would yield no performance
+          benefit. The number of maps launched would equal the number of files.</p>
+      </answer>
+    </faq>
+
+    <faq id="client_mem">
+      <question>Why does DistCp run out of memory?</question>
+      <answer>
+          <p>If the number of individual files/directories being copied from
+      the source path(s) is extremely large (e.g. 1,000,000 paths), DistCp might
+      run out of memory while determining the list of paths for copy. This is
+      not unique to the new DistCp implementation.</p>
+          <p>To get around this, consider changing the <code>-Xmx</code> JVM
+      heap-size parameters, as follows:</p>
+          <p><code>bash$ export HADOOP_CLIENT_OPTS="-Xms64m -Xmx1024m"</code></p>
+          <p><code>bash$ hadoop distcp /source /target</code></p>
+      </answer>
+    </faq>
+
+  </part>
+</faqs>

+ 47 - 0
hadoop-tools/hadoop-distcp/src/site/pdf.xml

@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<!-- START SNIPPET: docDescriptor -->
+<document xmlns="http://maven.apache.org/DOCUMENT/1.0.1"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/DOCUMENT/1.0.1 http://maven.apache.org/xsd/document-1.0.1.xsd"
+  outputName="distcp">
+
+  <meta>
+    <title>${project.name}</title>
+  </meta>
+
+  <toc name="Table of Contents">
+    <item name="Introduction" ref="index.xml"/>
+    <item name="Usage" ref="usage.xml"/>
+    <item name="Command Line Reference" ref="cli.xml"/>
+    <item name="Architecture" ref="architecture.xml"/>
+    <item name="Appendix" ref="appendix.xml"/>
+    <item name="FAQ" ref="faq.fml"/>
+  </toc>
+  <cover>
+    <coverTitle>${project.name}</coverTitle>
+    <coverSubTitle>v. ${project.version}</coverSubTitle>
+    <coverType>User Guide</coverType>
+    <projectName>${project.name}</projectName>
+    <companyName>Apache Hadoop</companyName>
+  </cover>
+</document>

+ 125 - 0
hadoop-tools/hadoop-distcp/src/site/xdoc/appendix.xml

@@ -0,0 +1,125 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document xmlns="http://maven.apache.org/XDOC/2.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/XDOC/2.0 http://maven.apache.org/xsd/xdoc-2.0.xsd">
+  <head>
+    <title>Appendix</title>
+  </head>
+  <body>
+    <section name="Map sizing">
+ 
+      <p> By default, DistCp makes an attempt to size each map comparably so
+      that each copies roughly the same number of bytes. Note that files are the
+      finest level of granularity, so increasing the number of simultaneous
+      copiers (i.e. maps) may not always increase the number of
+      simultaneous copies nor the overall throughput.</p>
+
+      <p> The new DistCp also provides a strategy to "dynamically" size maps,
+      allowing faster data-nodes to copy more bytes than slower nodes. Using
+      <code>-strategy dynamic</code> (explained in the Architecture), rather
+      than assigning a fixed set of source-files to each map-task, files are
+      instead split into several sets (or "chunks"). The number of chunks exceeds
+      the number of maps, usually by a factor of 2-3. Each map picks up and
+      copies all files listed in a chunk. When a chunk is exhausted, a new chunk
+      is acquired and processed, until no more chunks remain.</p>
+
+      <p> By not assigning a source-path to a fixed map, faster map-tasks (i.e.
+      data-nodes) are able to consume more chunks, and thus copy more data,
+      than slower nodes. While this distribution isn't uniform, it is
+      <strong>fair</strong> with regard to each mapper's capacity.</p>
+
+      <p>The dynamic-strategy is implemented by the DynamicInputFormat. It
+      provides superior performance under most conditions. </p>
+
+      <p>Tuning the number of maps to the size of the source and
+      destination clusters, the size of the copy, and the available
+      bandwidth is recommended for long-running and regularly run jobs.</p>
+
+   </section>
+
+   <section name="Copying between versions of HDFS">
+
+        <p>For copying between two different versions of Hadoop, one will
+        usually use HftpFileSystem. This is a read-only FileSystem, so DistCp
+        must be run on the destination cluster (more specifically, on
+        TaskTrackers that can write to the destination cluster). Each source is
+        specified as <code>hftp://&lt;dfs.http.address&gt;/&lt;path&gt;</code>
+        (the default <code>dfs.http.address</code> is
+        &lt;namenode&gt;:50070).</p>
+
+   </section>
+
+   <section name="Map/Reduce and other side-effects">
+
+        <p>As has been mentioned in the preceding, should a map fail to copy
+        one of its inputs, there will be several side-effects.</p>
+
+        <ul>
+
+          <li>Unless <code>-overwrite</code> is specified, files successfully
+          copied by a previous map on a re-execution will be marked as
+          &quot;skipped&quot;.</li>
+
+          <li>If a map fails <code>mapred.map.max.attempts</code> times, the
+          remaining map tasks will be killed (unless <code>-i</code> is
+          set).</li>
+
+          <li>If <code>mapred.speculative.execution</code> is set
+          <code>final</code> and <code>true</code>, the result of the copy is
+          undefined.</li>
+
+        </ul>
+
+   </section>
+
+   <section name="SSL Configurations for HSFTP sources:">
+
+       <p>To use an HSFTP source (i.e. using the hsftp protocol), a Map-Red SSL
+       configuration file needs to be specified (via the <code>-mapredSslConf</code>
+       option). This must specify 3 parameters:</p>
+
+       <ul>
+           <li><code>ssl.client.truststore.location</code>: The local-filesystem
+            location of the trust-store file, containing the certificate for
+            the namenode.</li>
+
+           <li><code>ssl.client.truststore.type</code>: (Optional) The format of
+           the trust-store file.</li>
+
+           <li><code>ssl.client.truststore.password</code>: (Optional) Password
+           for the trust-store file.</li>
+
+       </ul>
+
+       <p>The following is an example of the contents of
+       a Map-Red SSL Configuration file:</p>
+
+           <p> <br/> <code> &lt;configuration&gt; </code> </p>
+
+           <p> <br/> <code>&lt;property&gt; </code> </p>
+           <p> <code>&lt;name&gt;ssl.client.truststore.location&lt;/name&gt; </code> </p>
+           <p> <code>&lt;value&gt;/work/keystore.jks&lt;/value&gt; </code> </p>
+           <p> <code>&lt;description&gt;Truststore to be used by clients like distcp. Must be specified. &lt;/description&gt;</code> </p>
+           <p> <br/> <code>&lt;/property&gt; </code> </p>
+
+           <p><code> &lt;property&gt; </code> </p>
+           <p> <code>&lt;name&gt;ssl.client.truststore.password&lt;/name&gt; </code> </p>
+           <p> <code>&lt;value&gt;changeme&lt;/value&gt; </code> </p>
+           <p> <code>&lt;description&gt;Optional. Default value is "". &lt;/description&gt;  </code> </p>
+           <p> <code>&lt;/property&gt; </code>  </p>
+
+           <p> <br/> <code> &lt;property&gt; </code> </p>
+           <p> <code> &lt;name&gt;ssl.client.truststore.type&lt;/name&gt;</code>  </p>
+           <p> <code> &lt;value&gt;jks&lt;/value&gt;</code>  </p>
+           <p> <code> &lt;description&gt;Optional. Default value is "jks". &lt;/description&gt;</code>  </p>
+           <p> <code> &lt;/property&gt; </code> </p>
+
+           <p> <code> <br/> &lt;/configuration&gt; </code> </p>
+
+       <p><br/>The SSL configuration file must be in the class-path of the 
+       DistCp program.</p>
+
+   </section>
+
+  </body>
+</document>

+ 200 - 0
hadoop-tools/hadoop-distcp/src/site/xdoc/architecture.xml

@@ -0,0 +1,200 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document xmlns="http://maven.apache.org/XDOC/2.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/XDOC/2.0 http://maven.apache.org/xsd/xdoc-2.0.xsd">
+    <head>
+        <title>Architecture of DistCp</title>
+    </head>
+    <body>
+      <section name="Architecture">
+
+        <p>The components of the new DistCp may be classified into the following
+           categories: </p>
+
+        <ul>
+
+          <li>DistCp Driver</li>
+          <li>Copy-listing generator</li>
+          <li>Input-formats and Map-Reduce components</li>
+
+        </ul>
+
+        <subsection name="DistCp Driver">
+          <p>The DistCp Driver components are responsible for:</p>
+
+          <ul>
+            <li>Parsing the arguments passed to the DistCp command on the
+                command-line, via:
+              <ul>
+                <li>OptionsParser, and</li>
+                <li>DistCpOptionsSwitch</li>
+              </ul>
+            </li>
+            <li>Assembling the command arguments into an appropriate
+                DistCpOptions object, and initializing DistCp. These arguments
+                include:
+              <ul>
+                <li>Source-paths</li>
+                <li>Target location</li>
+                <li>Copy options (e.g. whether to update-copy, overwrite, which
+                    file-attributes to preserve, etc.)</li>
+              </ul>
+            </li>
+            <li>Orchestrating the copy operation by:
+              <ul>
+                <li>Invoking the copy-listing-generator to create the list of
+                    files to be copied.</li>
+                <li>Setting up and launching the Hadoop Map-Reduce Job to carry
+                    out the copy.</li>
+                <li>Based on the options, either returning a handle to the
+                    Hadoop MR Job immediately, or waiting till completion.</li>
+              </ul>
+            </li>
+          </ul>
+          <br/>
+
+          <p>The parser-elements are exercised only from the command-line (or if
+             DistCp::run() is invoked). The DistCp class may also be used
+             programmatically, by constructing the DistCpOptions object, and
+             initializing a DistCp object appropriately.</p>
+
+        </subsection>
+
+        <subsection name="Copy-listing generator">
+
+          <p>The copy-listing-generator classes are responsible for creating the
+             list of files/directories to be copied from source. They examine
+             the contents of the source-paths (files/directories, including
+             wild-cards), and record all paths that need copy into a sequence-
+             file, for consumption by the DistCp Hadoop Job. The main classes in
+             this module include:</p>
+
+          <ol>
+
+            <li>CopyListing: The interface that should be implemented by any 
+                copy-listing-generator implementation. Also provides the factory
+                method by which the concrete CopyListing implementation is
+                chosen.</li>
+
+            <li>SimpleCopyListing: An implementation of CopyListing that accepts
+                multiple source paths (files/directories), and recursively lists
+                all the individual files and directories under each, for
+                copy.</li>
+
+            <li>GlobbedCopyListing: Another implementation of CopyListing that
+                expands wild-cards in the source paths.</li>
+
+            <li>FileBasedCopyListing: An implementation of CopyListing that
+                reads the source-path list from a specified file.</li>
+
+          </ol>
+          <p/>
+
+          <p>Based on whether a source-file-list is specified in the
+             DistCpOptions, the source-listing is generated in one of the
+             following ways:</p>
+
+          <ol>
+
+            <li>If there's no source-file-list, the GlobbedCopyListing is used.
+                All wild-cards are expanded, and all the expansions are
+                forwarded to the SimpleCopyListing, which in turn constructs the
+                listing (via recursive descent of each path). </li>
+
+            <li>If a source-file-list is specified, the FileBasedCopyListing is
+                used. Source-paths are read from the specified file, and then
+                forwarded to the GlobbedCopyListing. The listing is then
+                constructed as described above.</li>
+
+          </ol>
+
+          <br/>
+
+          <p>One may customize the method by which the copy-listing is
+             constructed by providing a custom implementation of the CopyListing
+             interface. The behaviour of DistCp differs here from the legacy
+             DistCp, in how paths are considered for copy. </p>
+
+          <p>The legacy implementation only lists those paths that must
+             definitely be copied on to target.
+             E.g. if a file already exists at the target (and -overwrite isn't
+             specified), the file isn't even considered in the Map-Reduce Copy
+             Job. Determining this during setup (i.e. before the Map-Reduce Job)
+             involves file-size and checksum-comparisons that are potentially
+             time-consuming.</p>
+
+          <p>The new DistCp postpones such checks until the Map-Reduce Job, thus
+             reducing setup time. Performance is enhanced further since these
+             checks are parallelized across multiple maps.</p>
+
+        </subsection>
+
+        <subsection name="Input-formats and Map-Reduce components">
+
+          <p> The Input-formats and Map-Reduce components are responsible for
+              the actual copy of files and directories from the source to the
+              destination path. The listing-file created during copy-listing
+              generation is consumed at this point, when the copy is carried
+              out. The classes of interest here include:</p>
+
+          <ul>
+            <li><strong>UniformSizeInputFormat:</strong> This implementation of
+                org.apache.hadoop.mapreduce.InputFormat provides equivalence
+                with Legacy DistCp in balancing load across maps.
+                The aim of the UniformSizeInputFormat is to make each map copy
+                roughly the same number of bytes. To that end, the listing file is
+                split into groups of paths, such that the sum of file-sizes in
+                each InputSplit is nearly equal to that of every other split. The splitting
+                isn't always perfect, but its trivial implementation keeps the
+                setup-time low.</li>
+
+            <li><strong>DynamicInputFormat and DynamicRecordReader:</strong>
+                <p> The DynamicInputFormat implements org.apache.hadoop.mapreduce.InputFormat,
+                and is new to DistCp. The listing-file is split into several
+                "chunk-files", the exact number of chunk-files being a multiple
+                of the number of maps requested for in the Hadoop Job. Each map
+                task is "assigned" one of the chunk-files (by renaming the chunk
+                to the task's id), before the Job is launched.</p>
+
+                <p>Paths are read from each chunk using the DynamicRecordReader,
+                and processed in the CopyMapper. After all the paths in a chunk
+                are processed, the current chunk is deleted and a new chunk is
+                acquired. The process continues until no more chunks are
+                available.</p>
+                <p>This "dynamic" approach allows faster map-tasks to consume
+                more paths than slower ones, thus speeding up the DistCp job
+                overall. </p>
+            </li>
+
+            <li><strong>CopyMapper:</strong> This class implements the physical
+                file-copy. The input-paths are checked against the input-options
+                (specified in the Job's Configuration), to determine whether a
+                file needs copy. A file will be copied only if at least one of
+                the following is true:
+              <ul>
+                <li>A file with the same name doesn't exist at target.</li>
+                <li>A file with the same name exists at target, but has a
+                    different file size.</li>
+                <li>A file with the same name exists at target, but has a
+                    different checksum, and -skipcrccheck isn't mentioned.</li>
+                <li>A file with the same name exists at target, but -overwrite
+                    is specified.</li>
+                <li>A file with the same name exists at target, but differs in
+                    block-size (and block-size needs to be preserved).</li>
+              </ul>
+            </li>
+
+            <li><strong>CopyCommitter:</strong>
+                This class is responsible for the commit-phase of the DistCp
+                job, including:
+              <ul>
+                <li>Preservation of directory-permissions (if specified in the
+                    options)</li>
+                <li>Clean-up of temporary-files, work-directories, etc.</li>
+              </ul>
+            </li>
+          </ul>
+        </subsection>
+      </section>
+    </body>
+</document>
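
As the "DistCp Driver" subsection notes, DistCp may also be used programmatically by constructing a DistCpOptions object. A rough sketch of such an invocation follows; the DistCpOptions constructor and setter names are assumed to match this version of the code, and the cluster paths are hypothetical:

    // Illustrative only: programmatic DistCp invocation using the dynamic strategy.
    Configuration conf = new Configuration();
    List<Path> sources = Arrays.asList(new Path("hdfs://nn1:8020/source/dir"));
    Path target        = new Path("hdfs://nn2:8020/target/dir");

    DistCpOptions options = new DistCpOptions(sources, target);
    options.setCopyStrategy("dynamic");   // resolved via distcp.dynamic.strategy.impl

    DistCp distCp = new DistCp(conf, options);
    distCp.execute();                     // launches the copy job and waits for it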

+ 123 - 0
hadoop-tools/hadoop-distcp/src/site/xdoc/cli.xml

@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document xmlns="http://maven.apache.org/XDOC/2.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/XDOC/2.0 http://maven.apache.org/xsd/xdoc-2.0.xsd">
+  <head>
+    <title>Command Line Options</title>
+  </head>
+  <body>
+      <section name="Options Index"> 
+        <table>
+          <tr><th> Flag </th><th> Description </th><th> Notes </th></tr>
+
+          <tr><td><code>-p[rbugp]</code></td>
+              <td>Preserve<br/>
+                  r: replication number<br/>
+                  b: block size<br/>
+                  u: user<br/>
+                  g: group<br/>
+                  p: permission<br/></td>
+              <td>Modification times are not preserved. Also, when
+              <code>-update</code> is specified, status updates will
+              <strong>not</strong> be synchronized unless the file sizes
+              also differ (i.e. unless the file is re-created).
+              </td></tr>
+          <tr><td><code>-i</code></td>
+              <td>Ignore failures</td>
+              <td>As explained in the Appendix, this option
+              will keep more accurate statistics about the copy than the
+              default case. It also preserves logs from failed copies, which
+              can be valuable for debugging. Finally, a failing map will not
+              cause the job to fail before all splits are attempted.
+              </td></tr>
+          <tr><td><code>-log &lt;logdir&gt;</code></td>
+              <td>Write logs to &lt;logdir&gt;</td>
+              <td>DistCp keeps logs of each file it attempts to copy as map
+              output. If a map fails, the log output will not be retained if
+              it is re-executed.
+              </td></tr>
+          <tr><td><code>-m &lt;num_maps&gt;</code></td>
+              <td>Maximum number of simultaneous copies</td>
+              <td>Specify the number of maps to copy data. Note that more maps
+              may not necessarily improve throughput.
+              </td></tr>
+          <tr><td><code>-overwrite</code></td>
+              <td>Overwrite destination</td>
+              <td>If a map fails and <code>-i</code> is not specified, all the
+              files in the split, not only those that failed, will be recopied.
+              As discussed in the Usage documentation, it also changes
+              the semantics for generating destination paths, so users should
+              use this carefully.
+              </td></tr>
+          <tr><td><code>-update</code></td>
+              <td>Overwrite if src size different from dst size</td>
+              <td>As noted in the preceding, this is not a &quot;sync&quot;
+              operation. The only criterion examined is the source and
+              destination file sizes; if they differ, the source file
+              replaces the destination file. As discussed in the
+              Usage documentation, it also changes the semantics for
+              generating destination paths, so users should use this carefully.
+              </td></tr>
+          <tr><td><code>-f &lt;urilist_uri&gt;</code></td>
+              <td>Use list at &lt;urilist_uri&gt; as src list</td>
+              <td>This is equivalent to listing each source on the command
+              line. The <code>urilist_uri</code> list should be a fully
+              qualified URI.
+              </td></tr>
+          <tr><td><code>-filelimit &lt;n&gt;</code></td>
+              <td>Limit the total number of files to be &lt;= n</td>
+              <td><strong>Deprecated!</strong> Ignored in the new DistCp.
+              </td></tr>
+          <tr><td><code>-sizelimit &lt;n&gt;</code></td>
+              <td>Limit the total size to be &lt;= n bytes</td>
+              <td><strong>Deprecated!</strong> Ignored in the new DistCp.
+              </td></tr>
+          <tr><td><code>-delete</code></td>
+              <td>Delete the files existing in the dst but not in src</td>
+              <td>The deletion is done by FS Shell, so the trash will be used
+                  if it is enabled.
+              </td></tr>
+          <tr><td><code>-strategy {dynamic|uniformsize}</code></td>
+              <td>Choose the copy-strategy to be used in DistCp.</td>
+              <td>By default, uniformsize is used. (i.e. Maps are balanced on the
+                  total size of files copied by each map. Similar to legacy.)
+                  If "dynamic" is specified, <code>DynamicInputFormat</code> is
+                  used instead. (This is described in the Architecture section,
+                  under InputFormats.)
+              </td></tr>
+          <tr><td><code>-bandwidth</code></td>
+                <td>Specify bandwidth per map, in MB/second.</td>
+                <td>Each map will be restricted to consume only the specified
+                    bandwidth. This is not always exact. The map throttles back
+                    its bandwidth consumption during a copy, such that the
+                    <strong>net</strong> bandwidth used tends towards the
+                    specified value.
+                </td></tr>
+          <tr><td><code>-atomic {-tmp &lt;tmp_dir&gt;}</code></td>
+                <td>Specify atomic commit, with optional tmp directory.</td>
+                <td><code>-atomic</code> instructs DistCp to copy the source
+                    data to a temporary target location, and then move the
+                    temporary target to the final-location atomically. Data will
+                    either be available at final target in a complete and consistent
+                    form, or not at all.
+                    Optionally, <code>-tmp</code> may be used to specify the
+                    location of the tmp-target. If not specified, a default is
+                    chosen. <strong>Note:</strong> tmp_dir must be on the final
+                    target cluster.
+                </td></tr>
+            <tr><td><code>-mapredSslConf &lt;ssl_conf_file&gt;</code></td>
+                  <td>Specify SSL Config file, to be used with HSFTP source</td>
+                  <td>When using the hsftp protocol with a source, the security-
+                      related properties may be specified in a config-file and
+                      passed to DistCp. &lt;ssl_conf_file&gt; needs to be in
+                      the classpath.
+                  </td></tr>
+            <tr><td><code>-async</code></td>
+                  <td>Run DistCp asynchronously. Quits as soon as the Hadoop
+                  Job is launched.</td>
+                  <td>The Hadoop Job-id is logged, for tracking.
+                  </td></tr>
+        </table>
+      </section>
+  </body>
+</document>
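
Many of the flags in this table correspond to DistCpOptions setters that the new tests in this change call directly (setSyncFolder, setCopyStrategy, setAtomicCommit, setAtomicWorkPath, setBlocking). A hedged sketch of exercising the same options programmatically follows; the flag-to-setter pairing is inferred from those tests rather than stated by this documentation, so treat it as an assumption.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;

public class ProgrammaticDistCpSketch {
  public static void main(String[] args) throws Exception {
    // Source and target URIs reuse the placeholders from the usage examples.
    List<Path> sources = Arrays.asList(new Path("hdfs://nn1:8020/foo/bar"));
    Path target = new Path("hdfs://nn2:8020/bar/foo");

    DistCpOptions options = new DistCpOptions(sources, target);
    options.setSyncFolder(true);                              // assumed: -update
    options.setCopyStrategy("dynamic");                       // assumed: -strategy dynamic
    options.setAtomicCommit(true);                            // assumed: -atomic
    options.setAtomicWorkPath(new Path("/tmp/distcp-work"));  // assumed: -atomic -tmp
    options.setBlocking(false);                               // assumed: -async

    // execute() submits the Hadoop Job; with blocking disabled it returns
    // immediately, as the tests in this change rely on.
    Job job = new DistCp(new Configuration(), options).execute();
    System.out.println("Submitted " + job.getJobID());
    job.waitForCompletion(true);
  }
}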

+ 32 - 0
hadoop-tools/hadoop-distcp/src/site/xdoc/index.xml

@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document xmlns="http://maven.apache.org/XDOC/2.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/XDOC/2.0 http://maven.apache.org/xsd/xdoc-2.0.xsd">
+  <head>
+    <title>DistCp</title>
+  </head>
+  <body>
+    <section name="Overview">
+      <p>
+        DistCp (distributed copy) is a tool used for large inter/intra-cluster
+      copying. It uses Map/Reduce to effect its distribution, error
+      handling and recovery, and reporting. It expands a list of files and
+      directories into input to map tasks, each of which will copy a partition
+      of the files specified in the source list.
+      </p>
+      <p>
+       The erstwhile implementation of DistCp has its share of quirks and
+       drawbacks, both in its usage and in its extensibility and
+       performance. The purpose of the DistCp refactor was to fix these shortcomings,
+       enabling it to be used and extended programmatically. New paradigms have
+       been introduced to improve runtime and setup performance, while simultaneously
+       retaining the legacy behaviour as default.
+      </p>
+      <p>
+       This document aims to describe the design of the new DistCp, its spanking
+       new features, their optimal use, and any deviations from the legacy
+       implementation.
+      </p>
+    </section>
+  </body>
+</document>

+ 147 - 0
hadoop-tools/hadoop-distcp/src/site/xdoc/usage.xml

@@ -0,0 +1,147 @@
+<document xmlns="http://maven.apache.org/XDOC/2.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/XDOC/2.0 http://maven.apache.org/xsd/xdoc-2.0.xsd">
+  <head>
+    <title>Usage </title>
+  </head>
+  <body>
+    <section name="Basic Usage">
+        <p>The most common invocation of DistCp is an inter-cluster copy:</p>
+        <p><code>bash$ hadoop jar hadoop-distcp.jar hdfs://nn1:8020/foo/bar \</code><br/>
+           <code>                    hdfs://nn2:8020/bar/foo</code></p>
+
+        <p>This will expand the namespace under <code>/foo/bar</code> on nn1
+        into a temporary file, partition its contents among a set of map
+        tasks, and start a copy on each TaskTracker from nn1 to nn2.</p>
+
+        <p>One can also specify multiple source directories on the command
+        line:</p>
+        <p><code>bash$ hadoop jar hadoop-distcp.jar hdfs://nn1:8020/foo/a \</code><br/>
+           <code> hdfs://nn1:8020/foo/b \</code><br/>
+           <code> hdfs://nn2:8020/bar/foo</code></p>
+
+        <p>Or, equivalently, from a file using the <code>-f</code> option:<br/>
+        <code>bash$ hadoop jar hadoop-distcp.jar -f hdfs://nn1:8020/srclist \</code><br/>
+        <code> hdfs://nn2:8020/bar/foo</code><br/></p>
+
+        <p>Where <code>srclist</code> contains<br/>
+        <code>hdfs://nn1:8020/foo/a</code><br/>
+        <code>hdfs://nn1:8020/foo/b</code></p>
+
+        <p>When copying from multiple sources, DistCp will abort the copy with
+        an error message if two sources collide, but collisions at the
+        destination are resolved per the <a href="#options">options</a>
+        specified. By default, files already existing at the destination are
+        skipped (i.e. not replaced by the source file). A count of skipped
+        files is reported at the end of each job, but it may be inaccurate if a
+        copier failed for some subset of its files, but succeeded on a later
+        attempt.</p>
+
+        <p>It is important that each TaskTracker can reach and communicate with
+        both the source and destination file systems. For HDFS, both the source
+        and destination must be running the same version of the protocol or use
+        a backwards-compatible protocol (see <a href="#cpver">Copying Between
+        Versions</a>).</p>
+
+        <p>After a copy, it is recommended that one generates and cross-checks
+        a listing of the source and destination to verify that the copy was
+        truly successful. Since DistCp employs both Map/Reduce and the
+        FileSystem API, issues in or between any of the three could adversely
+        and silently affect the copy. Some have had success running with
+        <code>-update</code> enabled to perform a second pass, but users should
+        be acquainted with its semantics before attempting this.</p>
+
+        <p>It's also worth noting that if another client is still writing to a
+        source file, the copy will likely fail. Attempting to overwrite a file
+        being written at the destination should also fail on HDFS. If a source
+        file is (re)moved before it is copied, the copy will fail with a
+        FileNotFoundException.</p>
+
+        <p>Please refer to the detailed Command Line Reference for information
+        on all the options available in DistCp.</p>
+        
+    </section>
+    <section name="Update and Overwrite">
+
+        <p><code>-update</code> is used to copy files from source that don't
+        exist at the target, or have different contents. <code>-overwrite</code>
+        overwrites files at the target even if they already exist there
+        with the same contents.</p>
+
+        <p><br/>Update and Overwrite options warrant special attention, since their
+        handling of source-paths varies from the defaults in a very subtle manner.
+        Consider a copy from <code>/source/first/</code> and
+        <code>/source/second/</code> to <code>/target/</code>, where the source
+        paths have the following contents:</p>
+
+        <p><code>hdfs://nn1:8020/source/first/1</code><br/>
+           <code>hdfs://nn1:8020/source/first/2</code><br/>
+           <code>hdfs://nn1:8020/source/second/10</code><br/>
+           <code>hdfs://nn1:8020/source/second/20</code><br/></p>
+
+        <p><br/>When DistCp is invoked without <code>-update</code> or
+        <code>-overwrite</code>, the DistCp defaults would create directories
+        <code>first/</code> and <code>second/</code>, under <code>/target</code>.
+        Thus:<br/></p>
+
+        <p><code>distcp hdfs://nn1:8020/source/first hdfs://nn1:8020/source/second hdfs://nn2:8020/target</code></p>
+        <p><br/>would yield the following contents in <code>/target</code>: </p>
+
+        <p><code>hdfs://nn2:8020/target/first/1</code><br/>
+           <code>hdfs://nn2:8020/target/first/2</code><br/>
+           <code>hdfs://nn2:8020/target/second/10</code><br/>
+           <code>hdfs://nn2:8020/target/second/20</code><br/></p>
+
+        <p><br/>When either <code>-update</code> or <code>-overwrite</code> is
+            specified, the <strong>contents</strong> of the source-directories
+            are copied to target, and not the source directories themselves. Thus: </p>
+
+        <p><code>distcp -update hdfs://nn1:8020/source/first hdfs://nn1:8020/source/second hdfs://nn2:8020/target</code></p>
+
+        <p><br/>would yield the following contents in <code>/target</code>: </p>
+
+        <p><code>hdfs://nn2:8020/target/1</code><br/>
+           <code>hdfs://nn2:8020/target/2</code><br/>
+           <code>hdfs://nn2:8020/target/10</code><br/>
+           <code>hdfs://nn2:8020/target/20</code><br/></p>
+
+        <p><br/>By extension, if both source folders contained a file with the same
+        name (say, <code>0</code>), then both sources would map an entry to
+        <code>/target/0</code> at the destination. Rather than permit this
+        conflict, DistCp will abort.</p>
+
+        <p><br/>Now, consider the following copy operation:</p>
+
+        <p><code>distcp -update hdfs://nn1:8020/source/first hdfs://nn1:8020/source/second hdfs://nn2:8020/target</code></p>
+
+        <p><br/>With sources/sizes:</p>
+
+        <p><code>hdfs://nn1:8020/source/first/1     32</code><br/>
+           <code>hdfs://nn1:8020/source/first/2     32</code><br/>
+           <code>hdfs://nn1:8020/source/second/10   64</code><br/>
+           <code>hdfs://nn1:8020/source/second/20   32</code><br/></p>
+
+        <p><br/>And destination/sizes:</p>
+
+        <p><code>hdfs://nn2:8020/target/1   32</code><br/>
+           <code>hdfs://nn2:8020/target/10  32</code><br/>
+           <code>hdfs://nn2:8020/target/20  64</code><br/></p>
+
+        <p><br/>Will effect: </p>
+
+        <p><code>hdfs://nn2:8020/target/1   32</code><br/>
+           <code>hdfs://nn2:8020/target/2   32</code><br/>
+           <code>hdfs://nn2:8020/target/10  64</code><br/>
+           <code>hdfs://nn2:8020/target/20  32</code><br/></p>
+
+        <p><br/><code>1</code> is skipped because the file-length and contents match.
+        <code>2</code> is copied because it doesn't exist at the target.
+        <code>10</code> and <code>20</code> are overwritten since the contents
+        don't match the source. </p>
+
+        <p>If <code>-overwrite</code> is used, <code>1</code> is overwritten as well.</p>
+
+    </section>
+  </body>
+
+</document>
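
The path-mapping difference described in the Update and Overwrite section can be restated as a tiny helper. This is only a sketch of the documented behaviour, with made-up names; it is not how DistCp computes target paths internally.

// Illustrative sketch only: mirrors the documented difference between the
// default path mapping and the -update / -overwrite mapping. Names are made up.
public class TargetPathSketch {

  static String targetPathFor(String sourceRoot, String sourceFile,
                              String targetRoot, boolean updateOrOverwrite) {
    String relative = sourceFile.substring(sourceRoot.length()); // e.g. "/1"
    if (updateOrOverwrite) {
      // -update / -overwrite: the contents of the source dir land under target
      return targetRoot + relative;
    }
    // default: the source directory itself is re-created under the target
    String dirName = sourceRoot.substring(sourceRoot.lastIndexOf('/')); // e.g. "/first"
    return targetRoot + dirName + relative;
  }

  public static void main(String[] args) {
    String src = "hdfs://nn1:8020/source/first";
    String file = src + "/1";
    System.out.println(targetPathFor(src, file, "hdfs://nn2:8020/target", false));
    // hdfs://nn2:8020/target/first/1
    System.out.println(targetPathFor(src, file, "hdfs://nn2:8020/target", true));
    // hdfs://nn2:8020/target/1
  }
}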

+ 139 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/StubContext.java

@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.task.MapContextImpl;
+import org.apache.hadoop.mapreduce.lib.map.WrappedMapper;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.conf.Configuration;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.io.IOException;
+
+public class StubContext {
+
+  private StubStatusReporter reporter = new StubStatusReporter();
+  private RecordReader<Text, FileStatus> reader;
+  private StubInMemoryWriter writer = new StubInMemoryWriter();
+  private Mapper<Text, FileStatus, Text, Text>.Context mapperContext;
+
+  public StubContext(Configuration conf, RecordReader<Text, FileStatus> reader,
+                     int taskId) throws IOException, InterruptedException {
+
+    WrappedMapper<Text, FileStatus, Text, Text> wrappedMapper
+            = new WrappedMapper<Text, FileStatus, Text, Text>();
+
+    MapContextImpl<Text, FileStatus, Text, Text> contextImpl
+            = new MapContextImpl<Text, FileStatus, Text, Text>(conf,
+            getTaskAttemptID(taskId), reader, writer,
+            null, reporter, null);
+
+    this.reader = reader;
+    this.mapperContext = wrappedMapper.getMapContext(contextImpl);
+  }
+
+  public Mapper<Text, FileStatus, Text, Text>.Context getContext() {
+    return mapperContext;
+  }
+
+  public StatusReporter getReporter() {
+    return reporter;
+  }
+
+  public RecordReader<Text, FileStatus> getReader() {
+    return reader;
+  }
+
+  public StubInMemoryWriter getWriter() {
+    return writer;
+  }
+
+  public static class StubStatusReporter extends StatusReporter {
+
+    private Counters counters = new Counters();
+
+    public StubStatusReporter() {
+	    /*
+      final CounterGroup counterGroup
+              = new CounterGroup("FileInputFormatCounters",
+                                 "FileInputFormatCounters");
+      counterGroup.addCounter(new Counter("BYTES_READ",
+                                          "BYTES_READ",
+                                          0));
+      counters.addGroup(counterGroup);
+      */
+    }
+
+    @Override
+    public Counter getCounter(Enum<?> name) {
+      return counters.findCounter(name);
+    }
+
+    @Override
+    public Counter getCounter(String group, String name) {
+      return counters.findCounter(group, name);
+    }
+
+    @Override
+    public void progress() {}
+
+    @Override
+    public float getProgress() {
+      return 0F;
+    }
+
+    @Override
+    public void setStatus(String status) {}
+  }
+
+
+  public static class StubInMemoryWriter extends RecordWriter<Text, Text> {
+
+    List<Text> keys = new ArrayList<Text>();
+
+    List<Text> values = new ArrayList<Text>();
+
+    @Override
+    public void write(Text key, Text value) throws IOException, InterruptedException {
+      keys.add(key);
+      values.add(value);
+    }
+
+    @Override
+    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
+    }
+
+    public List<Text> keys() {
+      return keys;
+    }
+
+    public List<Text> values() {
+      return values;
+    }
+
+  }
+
+  public static TaskAttemptID getTaskAttemptID(int taskId) {
+    return new TaskAttemptID("", 0, TaskType.MAP, taskId, 0);
+  }
+}

+ 252 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java

@@ -0,0 +1,252 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.tools.util.TestDistCpUtils;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.security.Credentials;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.junit.Test;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.AfterClass;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.List;
+import java.util.ArrayList;
+
+public class TestCopyListing extends SimpleCopyListing {
+  private static final Log LOG = LogFactory.getLog(TestCopyListing.class);
+
+  private static final Credentials CREDENTIALS = new Credentials();
+
+  private static final Configuration config = new Configuration();
+  private static MiniDFSCluster cluster;
+
+  @BeforeClass
+  public static void create() throws IOException {
+    cluster = new MiniDFSCluster.Builder(config).numDataNodes(1).format(true)
+                                                .build();
+  }
+
+  @AfterClass
+  public static void destroy() {
+    if (cluster != null) {
+      cluster.shutdown();
+    }
+  }
+  
+  public TestCopyListing() {
+    super(config, CREDENTIALS);
+  }
+
+  protected TestCopyListing(Configuration configuration) {
+    super(configuration, CREDENTIALS);
+  }
+
+  @Override
+  protected long getBytesToCopy() {
+    return 0;
+  }
+
+  @Override
+  protected long getNumberOfPaths() {
+    return 0;
+  }
+
+  @Test
+  public void testMultipleSrcToFile() {
+    FileSystem fs = null;
+    try {
+      fs = FileSystem.get(getConf());
+      List<Path> srcPaths = new ArrayList<Path>();
+      srcPaths.add(new Path("/tmp/in/1"));
+      srcPaths.add(new Path("/tmp/in/2"));
+      Path target = new Path("/tmp/out/1");
+      TestDistCpUtils.createFile(fs, "/tmp/in/1");
+      TestDistCpUtils.createFile(fs, "/tmp/in/2");
+      fs.mkdirs(target);
+      DistCpOptions options = new DistCpOptions(srcPaths, target);
+      validatePaths(options);
+      TestDistCpUtils.delete(fs, "/tmp");
+      //No errors
+
+      target = new Path("/tmp/out/1");
+      fs.create(target).close();
+      options = new DistCpOptions(srcPaths, target);
+      try {
+        validatePaths(options);
+        Assert.fail("Invalid inputs accepted");
+      } catch (InvalidInputException ignore) { }
+      TestDistCpUtils.delete(fs, "/tmp");
+
+      srcPaths.clear();
+      srcPaths.add(new Path("/tmp/in/1"));
+      fs.mkdirs(new Path("/tmp/in/1"));
+      target = new Path("/tmp/out/1");
+      fs.create(target).close();
+      options = new DistCpOptions(srcPaths, target);
+      try {
+        validatePaths(options);
+        Assert.fail("Invalid inputs accepted");
+      } catch (InvalidInputException ignore) { }
+      TestDistCpUtils.delete(fs, "/tmp");
+    } catch (IOException e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Test input validation failed");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testDuplicates() {
+    FileSystem fs = null;
+    try {
+      fs = FileSystem.get(getConf());
+      List<Path> srcPaths = new ArrayList<Path>();
+      srcPaths.add(new Path("/tmp/in/*/*"));
+      TestDistCpUtils.createFile(fs, "/tmp/in/1.txt");
+      TestDistCpUtils.createFile(fs, "/tmp/in/src/1.txt");
+      Path target = new Path("/tmp/out");
+      Path listingFile = new Path("/tmp/list");
+      DistCpOptions options = new DistCpOptions(srcPaths, target);
+      CopyListing listing = CopyListing.getCopyListing(getConf(), CREDENTIALS, options);
+      try {
+        listing.buildListing(listingFile, options);
+        Assert.fail("Duplicates not detected");
+      } catch (DuplicateFileException ignore) {
+      }
+    } catch (IOException e) {
+      LOG.error("Exception encountered in test", e);
+      Assert.fail("Test failed " + e.getMessage());
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testBuildListing() {
+    FileSystem fs = null;
+    try {
+      fs = FileSystem.get(getConf());
+      List<Path> srcPaths = new ArrayList<Path>();
+      Path p1 = new Path("/tmp/in/1");
+      Path p2 = new Path("/tmp/in/2");
+      Path p3 = new Path("/tmp/in2/2");
+      Path target = new Path("/tmp/out/1");
+      srcPaths.add(p1.getParent());
+      srcPaths.add(p3.getParent());
+      TestDistCpUtils.createFile(fs, "/tmp/in/1");
+      TestDistCpUtils.createFile(fs, "/tmp/in/2");
+      TestDistCpUtils.createFile(fs, "/tmp/in2/2");
+      fs.mkdirs(target);
+      OutputStream out = fs.create(p1);
+      out.write("ABC".getBytes());
+      out.close();
+
+      out = fs.create(p2);
+      out.write("DEF".getBytes());
+      out.close();
+
+      out = fs.create(p3);
+      out.write("GHIJ".getBytes());
+      out.close();
+
+      Path listingFile = new Path("/tmp/file");
+
+      DistCpOptions options = new DistCpOptions(srcPaths, target);
+      options.setSyncFolder(true);
+      CopyListing listing = new SimpleCopyListing(getConf(), CREDENTIALS);
+      try {
+        listing.buildListing(listingFile, options);
+        Assert.fail("Duplicates not detected");
+      } catch (DuplicateFileException ignore) {
+      }
+      Assert.assertEquals(listing.getBytesToCopy(), 10);
+      Assert.assertEquals(listing.getNumberOfPaths(), 3);
+      TestDistCpUtils.delete(fs, "/tmp");
+
+      try {
+        listing.buildListing(listingFile, options);
+        Assert.fail("Invalid input not detected");
+      } catch (InvalidInputException ignore) {
+      }
+      TestDistCpUtils.delete(fs, "/tmp");
+    } catch (IOException e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Test build listing failed");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testBuildListingForSingleFile() {
+    FileSystem fs = null;
+    String testRootString = "/singleFileListing";
+    Path testRoot = new Path(testRootString);
+    SequenceFile.Reader reader = null;
+    try {
+      fs = FileSystem.get(getConf());
+      if (fs.exists(testRoot))
+        TestDistCpUtils.delete(fs, testRootString);
+
+      Path sourceFile = new Path(testRoot, "/source/foo/bar/source.txt");
+      Path decoyFile  = new Path(testRoot, "/target/moo/source.txt");
+      Path targetFile = new Path(testRoot, "/target/moo/target.txt");
+
+      TestDistCpUtils.createFile(fs, sourceFile.toString());
+      TestDistCpUtils.createFile(fs, decoyFile.toString());
+      TestDistCpUtils.createFile(fs, targetFile.toString());
+
+      List<Path> srcPaths = new ArrayList<Path>();
+      srcPaths.add(sourceFile);
+
+      DistCpOptions options = new DistCpOptions(srcPaths, targetFile);
+      CopyListing listing = new SimpleCopyListing(getConf(), CREDENTIALS);
+
+      final Path listFile = new Path(testRoot, "/tmp/fileList.seq");
+      listing.buildListing(listFile, options);
+
+      reader = new SequenceFile.Reader(fs, listFile, getConf());
+      FileStatus fileStatus = new FileStatus();
+      Text relativePath = new Text();
+      Assert.assertTrue(reader.next(relativePath, fileStatus));
+      Assert.assertTrue(relativePath.toString().equals(""));
+    }
+    catch (Exception e) {
+      LOG.error("Unexpected exception: ", e);
+      Assert.fail("Unexpected exception encountered.");
+    }
+    finally {
+      TestDistCpUtils.delete(fs, testRootString);
+      IOUtils.closeStream(reader);
+    }
+  }
+}

+ 275 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCp.java

@@ -0,0 +1,275 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.mapred.MiniMRCluster;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobSubmissionFiles;
+import org.apache.hadoop.mapreduce.Cluster;
+import org.apache.hadoop.tools.mapred.CopyOutputFormat;
+import org.junit.*;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.io.*;
+
+@Ignore
+public class TestDistCp {
+  private static final Log LOG = LogFactory.getLog(TestDistCp.class);
+  private static List<Path> pathList = new ArrayList<Path>();
+  private static final int FILE_SIZE = 1024;
+
+  private static Configuration configuration;
+  private static MiniDFSCluster cluster;
+  private static MiniMRCluster mrCluster;
+
+  private static final String SOURCE_PATH = "/tmp/source";
+  private static final String TARGET_PATH = "/tmp/target";
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    configuration = getConfigurationForCluster();
+    cluster = new MiniDFSCluster.Builder(configuration).numDataNodes(1)
+                    .format(true).build();
+    System.setProperty("org.apache.hadoop.mapred.TaskTracker", "target/tmp");
+    configuration.set("org.apache.hadoop.mapred.TaskTracker", "target/tmp");
+    System.setProperty("hadoop.log.dir", "target/tmp");
+    configuration.set("hadoop.log.dir", "target/tmp");
+    mrCluster = new MiniMRCluster(1, cluster.getFileSystem().getUri().toString(), 1);
+    Configuration mrConf = mrCluster.createJobConf();
+    final String mrJobTracker = mrConf.get("mapred.job.tracker");
+    configuration.set("mapred.job.tracker", mrJobTracker);
+    final String mrJobTrackerAddress
+            = mrConf.get("mapred.job.tracker.http.address");
+    configuration.set("mapred.job.tracker.http.address", mrJobTrackerAddress);
+  }
+
+  @AfterClass
+  public static void cleanup() {
+    if (mrCluster != null) mrCluster.shutdown();
+    if (cluster != null) cluster.shutdown();
+  }
+
+  private static Configuration getConfigurationForCluster() throws IOException {
+    Configuration configuration = new Configuration();
+    System.setProperty("test.build.data", "target/build/TEST_DISTCP/data");
+    configuration.set("hadoop.log.dir", "target/tmp");
+
+    LOG.debug("fs.default.name  == " + configuration.get("fs.default.name"));
+    LOG.debug("dfs.http.address == " + configuration.get("dfs.http.address"));
+    return configuration;
+  }
+
+  private static void createSourceData() throws Exception {
+    mkdirs(SOURCE_PATH + "/1");
+    mkdirs(SOURCE_PATH + "/2");
+    mkdirs(SOURCE_PATH + "/2/3/4");
+    mkdirs(SOURCE_PATH + "/2/3");
+    mkdirs(SOURCE_PATH + "/5");
+    touchFile(SOURCE_PATH + "/5/6");
+    mkdirs(SOURCE_PATH + "/7");
+    mkdirs(SOURCE_PATH + "/7/8");
+    touchFile(SOURCE_PATH + "/7/8/9");
+  }
+
+  private static void mkdirs(String path) throws Exception {
+    FileSystem fileSystem = cluster.getFileSystem();
+    final Path qualifiedPath = new Path(path).makeQualified(fileSystem.getUri(),
+                                  fileSystem.getWorkingDirectory());
+    pathList.add(qualifiedPath);
+    fileSystem.mkdirs(qualifiedPath);
+  }
+
+  private static void touchFile(String path) throws Exception {
+    FileSystem fs;
+    DataOutputStream outputStream = null;
+    try {
+      fs = cluster.getFileSystem();
+      final Path qualifiedPath = new Path(path).makeQualified(fs.getUri(),
+                                            fs.getWorkingDirectory());
+      final long blockSize = fs.getDefaultBlockSize() * 2;
+      outputStream = fs.create(qualifiedPath, true, 0,
+              (short)(fs.getDefaultReplication()*2),
+              blockSize);
+      outputStream.write(new byte[FILE_SIZE]);
+      pathList.add(qualifiedPath);
+    }
+    finally {
+      IOUtils.cleanup(null, outputStream);
+    }
+  }
+
+  private static void clearState() throws Exception {
+    pathList.clear();
+    cluster.getFileSystem().delete(new Path(TARGET_PATH), true);
+    createSourceData();
+  }
+
+//  @Test
+  public void testUniformSizeDistCp() throws Exception {
+    try {
+      clearState();
+      final FileSystem fileSystem = cluster.getFileSystem();
+      Path sourcePath = new Path(SOURCE_PATH)
+              .makeQualified(fileSystem.getUri(),
+                             fileSystem.getWorkingDirectory());
+      List<Path> sources = new ArrayList<Path>();
+      sources.add(sourcePath);
+
+      Path targetPath = new Path(TARGET_PATH)
+              .makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory());
+      DistCpOptions options = new DistCpOptions(sources, targetPath);
+      options.setAtomicCommit(true);
+      options.setBlocking(false);
+      Job job = new DistCp(configuration, options).execute();
+      Path workDir = CopyOutputFormat.getWorkingDirectory(job);
+      Path finalDir = CopyOutputFormat.getCommitDirectory(job);
+
+      while (!job.isComplete()) {
+        if (cluster.getFileSystem().exists(workDir)) {
+          break;
+        }
+      }
+      job.waitForCompletion(true);
+      Assert.assertFalse(cluster.getFileSystem().exists(workDir));
+      Assert.assertTrue(cluster.getFileSystem().exists(finalDir));
+      Assert.assertFalse(cluster.getFileSystem().exists(
+          new Path(job.getConfiguration().get(DistCpConstants.CONF_LABEL_META_FOLDER))));
+      verifyResults();
+    }
+    catch (Exception e) {
+      LOG.error("Exception encountered", e);
+      Assert.fail("Unexpected exception: " + e.getMessage());
+    }
+  }
+
+//  @Test
+  public void testCleanup() {
+    try {
+      clearState();
+      Path sourcePath = new Path("noscheme:///file");
+      List<Path> sources = new ArrayList<Path>();
+      sources.add(sourcePath);
+
+      final FileSystem fs = cluster.getFileSystem();
+      Path targetPath = new Path(TARGET_PATH)
+              .makeQualified(fs.getUri(), fs.getWorkingDirectory());
+      DistCpOptions options = new DistCpOptions(sources, targetPath);
+
+      Path stagingDir = JobSubmissionFiles.getStagingDir(
+              new Cluster(configuration), configuration);
+      stagingDir.getFileSystem(configuration).mkdirs(stagingDir);
+
+      try {
+        new DistCp(configuration, options).execute();
+      } catch (Throwable t) {
+        Assert.assertEquals(stagingDir.getFileSystem(configuration).
+            listStatus(stagingDir).length, 0);
+      }
+    } catch (Exception e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("testCleanup failed " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void testRootPath() throws Exception {
+    try {
+      clearState();
+      List<Path> sources = new ArrayList<Path>();
+      final FileSystem fs = cluster.getFileSystem();
+      sources.add(new Path("/a")
+              .makeQualified(fs.getUri(), fs.getWorkingDirectory()));
+      sources.add(new Path("/b")
+              .makeQualified(fs.getUri(), fs.getWorkingDirectory()));
+      touchFile("/a/a.txt");
+      touchFile("/b/b.txt");
+
+      Path targetPath = new Path("/c")
+              .makeQualified(fs.getUri(), fs.getWorkingDirectory());
+      DistCpOptions options = new DistCpOptions(sources, targetPath);
+      new DistCp(configuration, options).execute();
+      Assert.assertTrue(fs.exists(new Path("/c/a/a.txt")));
+      Assert.assertTrue(fs.exists(new Path("/c/b/b.txt")));
+    }
+    catch (Exception e) {
+      LOG.error("Exception encountered", e);
+      Assert.fail("Unexpected exception: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void testDynamicDistCp() throws Exception {
+    try {
+      clearState();
+      final FileSystem fs = cluster.getFileSystem();
+      Path sourcePath = new Path(SOURCE_PATH)
+              .makeQualified(fs.getUri(), fs.getWorkingDirectory());
+      List<Path> sources = new ArrayList<Path>();
+      sources.add(sourcePath);
+
+      Path targetPath = new Path(TARGET_PATH)
+              .makeQualified(fs.getUri(), fs.getWorkingDirectory());
+      DistCpOptions options = new DistCpOptions(sources, targetPath);
+      options.setCopyStrategy("dynamic");
+
+      options.setAtomicCommit(true);
+      options.setAtomicWorkPath(new Path("/work"));
+      options.setBlocking(false);
+      Job job = new DistCp(configuration, options).execute();
+      Path workDir = CopyOutputFormat.getWorkingDirectory(job);
+      Path finalDir = CopyOutputFormat.getCommitDirectory(job);
+
+      while (!job.isComplete()) {
+        if (fs.exists(workDir)) {
+          break;
+        }
+      }
+      job.waitForCompletion(true);
+      Assert.assertFalse(fs.exists(workDir));
+      Assert.assertTrue(fs.exists(finalDir));
+
+      verifyResults();
+    }
+    catch (Exception e) {
+      LOG.error("Exception encountered", e);
+      Assert.fail("Unexpected exception: " + e.getMessage());
+    }
+  }
+
+  private static void verifyResults() throws Exception {
+    for (Path path : pathList) {
+      FileSystem fs = cluster.getFileSystem();
+
+      Path sourcePath = path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
+      Path targetPath
+              = new Path(sourcePath.toString().replaceAll(SOURCE_PATH, TARGET_PATH));
+
+      Assert.assertTrue(fs.exists(targetPath));
+      Assert.assertEquals(fs.isFile(sourcePath), fs.isFile(targetPath));
+    }
+  }
+}

+ 542 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestFileBasedCopyListing.java

@@ -0,0 +1,542 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.tools.util.TestDistCpUtils;
+import org.apache.hadoop.security.Credentials;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+public class TestFileBasedCopyListing {
+  private static final Log LOG = LogFactory.getLog(TestFileBasedCopyListing.class);
+
+  private static final Credentials CREDENTIALS = new Credentials();
+
+  private static final Configuration config = new Configuration();
+  private static MiniDFSCluster cluster;
+  private static FileSystem fs;
+
+  @BeforeClass
+  public static void create() throws IOException {
+    cluster = new MiniDFSCluster.Builder(config).numDataNodes(1).format(true)
+                                                .build();
+    fs = cluster.getFileSystem();
+    buildExpectedValuesMap();
+  }
+
+  @AfterClass
+  public static void destroy() {
+    if (cluster != null) {
+      cluster.shutdown();
+    }
+  }
+
+  private static Map<String, String> map = new HashMap<String, String>();
+
+  private static void buildExpectedValuesMap() {
+    map.put("/file1", "/tmp/singlefile1/file1");
+    map.put("/file2", "/tmp/singlefile2/file2");
+    map.put("/file3", "/tmp/multifile/file3");
+    map.put("/file4", "/tmp/multifile/file4");
+    map.put("/file5", "/tmp/multifile/file5");
+    map.put("/multifile/file3", "/tmp/multifile/file3");
+    map.put("/multifile/file4", "/tmp/multifile/file4");
+    map.put("/multifile/file5", "/tmp/multifile/file5");
+    map.put("/Ufile3", "/tmp/Umultifile/Ufile3");
+    map.put("/Ufile4", "/tmp/Umultifile/Ufile4");
+    map.put("/Ufile5", "/tmp/Umultifile/Ufile5");
+    map.put("/dir1", "/tmp/singledir/dir1");
+    map.put("/singledir/dir1", "/tmp/singledir/dir1");
+    map.put("/dir2", "/tmp/singledir/dir2");
+    map.put("/singledir/dir2", "/tmp/singledir/dir2");
+    map.put("/Udir1", "/tmp/Usingledir/Udir1");
+    map.put("/Udir2", "/tmp/Usingledir/Udir2");
+    map.put("/dir2/file6", "/tmp/singledir/dir2/file6");
+    map.put("/singledir/dir2/file6", "/tmp/singledir/dir2/file6");
+    map.put("/file7", "/tmp/singledir1/dir3/file7");
+    map.put("/file8", "/tmp/singledir1/dir3/file8");
+    map.put("/file9", "/tmp/singledir1/dir3/file9");
+    map.put("/dir3/file7", "/tmp/singledir1/dir3/file7");
+    map.put("/dir3/file8", "/tmp/singledir1/dir3/file8");
+    map.put("/dir3/file9", "/tmp/singledir1/dir3/file9");
+    map.put("/Ufile7", "/tmp/Usingledir1/Udir3/Ufile7");
+    map.put("/Ufile8", "/tmp/Usingledir1/Udir3/Ufile8");
+    map.put("/Ufile9", "/tmp/Usingledir1/Udir3/Ufile9");
+  }
+
+  @Test
+  public void testSingleFileMissingTarget() {
+    caseSingleFileMissingTarget(false);
+    caseSingleFileMissingTarget(true);
+  }
+
+  private void caseSingleFileMissingTarget(boolean sync) {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/singlefile1/file1");
+      createFiles("/tmp/singlefile1/file1");
+
+      runTest(listFile, target, sync);
+
+      checkResult(listFile, 0);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testSingleFileTargetFile() {
+    caseSingleFileTargetFile(false);
+    caseSingleFileTargetFile(true);
+  }
+
+  private void caseSingleFileTargetFile(boolean sync) {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/singlefile1/file1");
+      createFiles("/tmp/singlefile1/file1", target.toString());
+
+      runTest(listFile, target, sync);
+
+      checkResult(listFile, 0);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testSingleFileTargetDir() {
+    caseSingleFileTargetDir(false);
+    caseSingleFileTargetDir(true);
+  }
+
+  private void caseSingleFileTargetDir(boolean sync) {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/singlefile2/file2");
+      createFiles("/tmp/singlefile2/file2");
+      mkdirs(target.toString());
+
+      runTest(listFile, target, sync);
+
+      checkResult(listFile, 1);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testSingleDirTargetMissing() {
+    caseSingleDirTargetMissing(false);
+    caseSingleDirTargetMissing(true);
+  }
+
+  private void caseSingleDirTargetMissing(boolean sync) {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/singledir");
+      mkdirs("/tmp/singledir/dir1");
+
+      runTest(listFile, target, sync);
+
+      checkResult(listFile, 1);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testSingleDirTargetPresent() {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/singledir");
+      mkdirs("/tmp/singledir/dir1");
+      mkdirs(target.toString());
+
+      runTest(listFile, target);
+
+      checkResult(listFile, 1);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testUpdateSingleDirTargetPresent() {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/Usingledir");
+      mkdirs("/tmp/Usingledir/Udir1");
+      mkdirs(target.toString());
+
+      runTest(listFile, target, true);
+
+      checkResult(listFile, 1);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testMultiFileTargetPresent() {
+    caseMultiFileTargetPresent(false);
+    caseMultiFileTargetPresent(true);
+  }
+
+  private void caseMultiFileTargetPresent(boolean sync) {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5");
+      createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5");
+      mkdirs(target.toString());
+
+      runTest(listFile, target, sync);
+
+      checkResult(listFile, 3);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testMultiFileTargetMissing() {
+    caseMultiFileTargetMissing(false);
+    caseMultiFileTargetMissing(true);
+  }
+
+  private void caseMultiFileTargetMissing(boolean sync) {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5");
+      createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5");
+
+      runTest(listFile, target, sync);
+
+      checkResult(listFile, 3);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testMultiDirTargetPresent() {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/multifile", "/tmp/singledir");
+      createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5");
+      mkdirs(target.toString(), "/tmp/singledir/dir1");
+
+      runTest(listFile, target);
+
+      checkResult(listFile, 4);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testUpdateMultiDirTargetPresent() {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/Umultifile", "/tmp/Usingledir");
+      createFiles("/tmp/Umultifile/Ufile3", "/tmp/Umultifile/Ufile4", "/tmp/Umultifile/Ufile5");
+      mkdirs(target.toString(), "/tmp/Usingledir/Udir1");
+
+      runTest(listFile, target, true);
+
+      checkResult(listFile, 4);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testMultiDirTargetMissing() {
+    caseMultiDirTargetMissing(false);
+    caseMultiDirTargetMissing(true);
+  }
+
+  private void caseMultiDirTargetMissing(boolean sync) {
+
+    try {
+      Path listFile = new Path("/tmp/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/multifile", "/tmp/singledir");
+      createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5");
+      mkdirs("/tmp/singledir/dir1");
+
+      runTest(listFile, target, sync);
+
+      checkResult(listFile, 4);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
+  @Test
+  public void testGlobTargetMissingSingleLevel() {
+    caseGlobTargetMissingSingleLevel(false);
+    caseGlobTargetMissingSingleLevel(true);
+  }
+
+  private void caseGlobTargetMissingSingleLevel(boolean sync) {
+
+    try {
+      Path listFile = new Path("/tmp1/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/*");
+      createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5");
+      createFiles("/tmp/singledir/dir2/file6");
+
+      runTest(listFile, target, sync);
+
+      checkResult(listFile, 5);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+      TestDistCpUtils.delete(fs, "/tmp1");
+    }
+  }
+
+  @Test
+  public void testGlobTargetMissingMultiLevel() {
+    caseGlobTargetMissingMultiLevel(false);
+    caseGlobTargetMissingMultiLevel(true);
+  }
+
+  private void caseGlobTargetMissingMultiLevel(boolean sync) {
+
+    try {
+      Path listFile = new Path("/tmp1/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/*/*");
+      createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5");
+      createFiles("/tmp/singledir1/dir3/file7", "/tmp/singledir1/dir3/file8",
+          "/tmp/singledir1/dir3/file9");
+
+      runTest(listFile, target, sync);
+
+      checkResult(listFile, 6);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+      TestDistCpUtils.delete(fs, "/tmp1");
+    }
+  }
+
+  @Test
+  public void testGlobTargetDirMultiLevel() {
+
+    try {
+      Path listFile = new Path("/tmp1/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/*/*");
+      createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5");
+      createFiles("/tmp/singledir1/dir3/file7", "/tmp/singledir1/dir3/file8",
+          "/tmp/singledir1/dir3/file9");
+      mkdirs(target.toString());
+
+      runTest(listFile, target);
+
+      checkResult(listFile, 6);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+      TestDistCpUtils.delete(fs, "/tmp1");
+    }
+  }
+
+  @Test
+  public void testUpdateGlobTargetDirMultiLevel() {
+
+    try {
+      Path listFile = new Path("/tmp1/listing");
+      Path target = new Path("/tmp/target");
+
+      addEntries(listFile, "/tmp/*/*");
+      createFiles("/tmp/Umultifile/Ufile3", "/tmp/Umultifile/Ufile4", "/tmp/Umultifile/Ufile5");
+      createFiles("/tmp/Usingledir1/Udir3/Ufile7", "/tmp/Usingledir1/Udir3/Ufile8",
+          "/tmp/Usingledir1/Udir3/Ufile9");
+      mkdirs(target.toString());
+
+      runTest(listFile, target, true);
+
+      checkResult(listFile, 6);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing build listing", e);
+      Assert.fail("build listing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+      TestDistCpUtils.delete(fs, "/tmp1");
+    }
+  }
+
+  private void addEntries(Path listFile, String... entries) throws IOException {
+    OutputStream out = fs.create(listFile);
+    try {
+      for (String entry : entries){
+        out.write(entry.getBytes());
+        out.write("\n".getBytes());
+      }
+    } finally {
+      out.close();
+    }
+  }
+
+  private void createFiles(String... entries) throws IOException {
+    for (String entry : entries){
+      OutputStream out = fs.create(new Path(entry));
+      try {
+        out.write(entry.getBytes());
+        out.write("\n".getBytes());
+      } finally {
+        out.close();
+      }
+    }
+  }
+
+  private void mkdirs(String... entries) throws IOException {
+    for (String entry : entries){
+      fs.mkdirs(new Path(entry));
+    }
+  }
+
+  private void runTest(Path listFile, Path target) throws IOException {
+    runTest(listFile, target, true);
+  }
+
+  private void runTest(Path listFile, Path target, boolean sync) throws IOException {
+    CopyListing listing = new FileBasedCopyListing(config, CREDENTIALS);
+    DistCpOptions options = new DistCpOptions(listFile, target);
+    options.setSyncFolder(sync);
+    listing.buildListing(listFile, options);
+  }
+
+  private void checkResult(Path listFile, int count) throws IOException {
+    if (count == 0) {
+      return;
+    }
+
+    int recCount = 0;
+    SequenceFile.Reader reader = new SequenceFile.Reader(config,
+                                            SequenceFile.Reader.file(listFile));
+    try {
+      Text relPath = new Text();
+      FileStatus fileStatus = new FileStatus();
+      while (reader.next(relPath, fileStatus)) {
+        Assert.assertEquals(fileStatus.getPath().toUri().getPath(), map.get(relPath.toString()));
+        recCount++;
+      }
+    } finally {
+      IOUtils.closeStream(reader);
+    }
+    Assert.assertEquals(recCount, count);
+  }
+
+}

+ 135 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestGlobbedCopyListing.java

@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.security.Credentials;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.DataOutputStream;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+public class TestGlobbedCopyListing {
+
+  private static MiniDFSCluster cluster;
+
+  private static final Credentials CREDENTIALS = new Credentials();
+
+  public static Map<String, String> expectedValues = new HashMap<String, String>();
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    cluster = new MiniDFSCluster(new Configuration(), 1, true, null);
+    createSourceData();
+  }
+
+  private static void createSourceData() throws Exception {
+    mkdirs("/tmp/source/1");
+    mkdirs("/tmp/source/2");
+    mkdirs("/tmp/source/2/3");
+    mkdirs("/tmp/source/2/3/4");
+    mkdirs("/tmp/source/5");
+    touchFile("/tmp/source/5/6");
+    mkdirs("/tmp/source/7");
+    mkdirs("/tmp/source/7/8");
+    touchFile("/tmp/source/7/8/9");
+  }
+
+  private static void mkdirs(String path) throws Exception {
+    FileSystem fileSystem = null;
+    try {
+      fileSystem = cluster.getFileSystem();
+      fileSystem.mkdirs(new Path(path));
+      recordInExpectedValues(path);
+    }
+    finally {
+      IOUtils.cleanup(null, fileSystem);
+    }
+  }
+
+  private static void touchFile(String path) throws Exception {
+    FileSystem fileSystem = null;
+    DataOutputStream outputStream = null;
+    try {
+      fileSystem = cluster.getFileSystem();
+      outputStream = fileSystem.create(new Path(path), true, 0);
+      recordInExpectedValues(path);
+    }
+    finally {
+      IOUtils.cleanup(null, fileSystem, outputStream);
+    }
+  }
+
+  private static void recordInExpectedValues(String path) throws Exception {
+    FileSystem fileSystem = cluster.getFileSystem();
+    Path sourcePath = new Path(fileSystem.getUri().toString() + path);
+    expectedValues.put(sourcePath.toString(), DistCpUtils.getRelativePath(
+        new Path("/tmp/source"), sourcePath));
+  }
+
+  @AfterClass
+  public static void tearDown() {
+    cluster.shutdown();
+  }
+
+  @Test
+  public void testRun() throws Exception {
+    final URI uri = cluster.getFileSystem().getUri();
+    final String pathString = uri.toString();
+    Path fileSystemPath = new Path(pathString);
+    Path source = new Path(fileSystemPath.toString() + "/tmp/source");
+    Path target = new Path(fileSystemPath.toString() + "/tmp/target");
+    Path listingPath = new Path(fileSystemPath.toString() + "/tmp/META/fileList.seq");
+    DistCpOptions options = new DistCpOptions(Arrays.asList(source), target);
+
+    new GlobbedCopyListing(new Configuration(), CREDENTIALS).buildListing(listingPath, options);
+
+    verifyContents(listingPath);
+  }
+
+  private void verifyContents(Path listingPath) throws Exception {
+    SequenceFile.Reader reader = new SequenceFile.Reader(cluster.getFileSystem(),
+                                              listingPath, new Configuration());
+    Text key   = new Text();
+    FileStatus value = new FileStatus();
+    Map<String, String> actualValues = new HashMap<String, String>();
+    while (reader.next(key, value)) {
+      actualValues.put(value.getPath().toString(), key.toString());
+    }
+
+    Assert.assertEquals(expectedValues.size(), actualValues.size());
+    for (Map.Entry<String, String> entry : actualValues.entrySet()) {
+      Assert.assertEquals(entry.getValue(), expectedValues.get(entry.getKey()));
+    }
+  }
+}
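
A minimal sketch (not part of this patch; the class name and paths are illustrative, and a reachable file system is assumed) of the flow testRun() above exercises: GlobbedCopyListing expands the source paths, globs included, and writes the relative-path/FileStatus records to the listing file:

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.GlobbedCopyListing;

public class GlobbedListingSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path source  = new Path("hdfs://localhost:8020/tmp/source");            // may contain globs
    Path target  = new Path("hdfs://localhost:8020/tmp/target");
    Path listing = new Path("hdfs://localhost:8020/tmp/META/fileList.seq");
    DistCpOptions options = new DistCpOptions(Arrays.asList(source), target);
    // Expands the sources and writes the copy listing consumed by the map phase.
    new GlobbedCopyListing(conf, new Credentials()).buildListing(listing, options);
  }
}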

+ 466 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestIntegration.java

@@ -0,0 +1,466 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.tools.util.TestDistCpUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+public class TestIntegration {
+  private static final Log LOG = LogFactory.getLog(TestIntegration.class);
+
+  private static FileSystem fs;
+
+  private static Path listFile;
+  private static Path target;
+  private static String root;
+
+  private static Configuration getConf() {
+    Configuration conf = new Configuration();
+    conf.set("fs.default.name", "file:///");
+    conf.set("mapred.job.tracker", "local");
+    return conf;
+  }
+
+  @BeforeClass
+  public static void setup() {
+    try {
+      fs = FileSystem.get(getConf());
+      listFile = new Path("target/tmp/listing").makeQualified(fs.getUri(),
+              fs.getWorkingDirectory());
+      target = new Path("target/tmp/target").makeQualified(fs.getUri(),
+              fs.getWorkingDirectory());
+      root = new Path("target/tmp").makeQualified(fs.getUri(),
+              fs.getWorkingDirectory()).toString();
+      TestDistCpUtils.delete(fs, root);
+    } catch (IOException e) {
+      LOG.error("Exception encountered ", e);
+    }
+  }
+
+  @Test
+  public void testSingleFileMissingTarget() {
+    caseSingleFileMissingTarget(false);
+    caseSingleFileMissingTarget(true);
+  }
+
+  private void caseSingleFileMissingTarget(boolean sync) {
+
+    try {
+      addEntries(listFile, "singlefile1/file1");
+      createFiles("singlefile1/file1");
+
+      runTest(listFile, target, sync);
+
+      checkResult(target, 1);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testSingleFileTargetFile() {
+    caseSingleFileTargetFile(false);
+    caseSingleFileTargetFile(true);
+  }
+
+  private void caseSingleFileTargetFile(boolean sync) {
+
+    try {
+      addEntries(listFile, "singlefile1/file1");
+      createFiles("singlefile1/file1", target.toString());
+
+      runTest(listFile, target, sync);
+
+      checkResult(target, 1);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testSingleFileTargetDir() {
+    caseSingleFileTargetDir(false);
+    caseSingleFileTargetDir(true);
+  }
+
+  private void caseSingleFileTargetDir(boolean sync) {
+
+    try {
+      addEntries(listFile, "singlefile2/file2");
+      createFiles("singlefile2/file2");
+      mkdirs(target.toString());
+
+      runTest(listFile, target, sync);
+
+      checkResult(target, 1, "file2");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testSingleDirTargetMissing() {
+    caseSingleDirTargetMissing(false);
+    caseSingleDirTargetMissing(true);
+  }
+
+  private void caseSingleDirTargetMissing(boolean sync) {
+
+    try {
+      addEntries(listFile, "singledir");
+      mkdirs(root + "/singledir/dir1");
+
+      runTest(listFile, target, sync);
+
+      checkResult(target, 1, "dir1");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testSingleDirTargetPresent() {
+
+    try {
+      addEntries(listFile, "singledir");
+      mkdirs(root + "/singledir/dir1");
+      mkdirs(target.toString());
+
+      runTest(listFile, target, false);
+
+      checkResult(target, 1, "singledir/dir1");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testUpdateSingleDirTargetPresent() {
+
+    try {
+      addEntries(listFile, "Usingledir");
+      mkdirs(root + "/Usingledir/Udir1");
+      mkdirs(target.toString());
+
+      runTest(listFile, target, true);
+
+      checkResult(target, 1, "Udir1");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testMultiFileTargetPresent() {
+    caseMultiFileTargetPresent(false);
+    caseMultiFileTargetPresent(true);
+  }
+
+  private void caseMultiFileTargetPresent(boolean sync) {
+
+    try {
+      addEntries(listFile, "multifile/file3", "multifile/file4", "multifile/file5");
+      createFiles("multifile/file3", "multifile/file4", "multifile/file5");
+      mkdirs(target.toString());
+
+      runTest(listFile, target, sync);
+
+      checkResult(target, 3, "file3", "file4", "file5");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testMultiFileTargetMissing() {
+    caseMultiFileTargetMissing(false);
+    caseMultiFileTargetMissing(true);
+  }
+
+  private void caseMultiFileTargetMissing(boolean sync) {
+
+    try {
+      addEntries(listFile, "multifile/file3", "multifile/file4", "multifile/file5");
+      createFiles("multifile/file3", "multifile/file4", "multifile/file5");
+
+      runTest(listFile, target, sync);
+
+      checkResult(target, 3, "file3", "file4", "file5");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testMultiDirTargetPresent() {
+
+    try {
+      addEntries(listFile, "multifile", "singledir");
+      createFiles("multifile/file3", "multifile/file4", "multifile/file5");
+      mkdirs(target.toString(), root + "/singledir/dir1");
+
+      runTest(listFile, target, false);
+
+      checkResult(target, 2, "multifile/file3", "multifile/file4", "multifile/file5", "singledir/dir1");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testUpdateMultiDirTargetPresent() {
+
+    try {
+      addEntries(listFile, "Umultifile", "Usingledir");
+      createFiles("Umultifile/Ufile3", "Umultifile/Ufile4", "Umultifile/Ufile5");
+      mkdirs(target.toString(), root + "/Usingledir/Udir1");
+
+      runTest(listFile, target, true);
+
+      checkResult(target, 4, "Ufile3", "Ufile4", "Ufile5", "Udir1");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testMultiDirTargetMissing() {
+
+    try {
+      addEntries(listFile, "multifile", "singledir");
+      createFiles("multifile/file3", "multifile/file4", "multifile/file5");
+      mkdirs(root + "/singledir/dir1");
+
+      runTest(listFile, target, false);
+
+      checkResult(target, 2, "multifile/file3", "multifile/file4",
+          "multifile/file5", "singledir/dir1");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testUpdateMultiDirTargetMissing() {
+
+    try {
+      addEntries(listFile, "multifile", "singledir");
+      createFiles("multifile/file3", "multifile/file4", "multifile/file5");
+      mkdirs(root + "/singledir/dir1");
+
+      runTest(listFile, target, true);
+
+      checkResult(target, 4, "file3", "file4", "file5", "dir1");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+    }
+  }
+
+  @Test
+  public void testGlobTargetMissingSingleLevel() {
+
+    try {
+      Path listFile = new Path("target/tmp1/listing").makeQualified(fs.getUri(),
+                                fs.getWorkingDirectory());
+      addEntries(listFile, "*");
+      createFiles("multifile/file3", "multifile/file4", "multifile/file5");
+      createFiles("singledir/dir2/file6");
+
+      runTest(listFile, target, false);
+
+      checkResult(target, 2, "multifile/file3", "multifile/file4", "multifile/file5",
+          "singledir/dir2/file6");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+      TestDistCpUtils.delete(fs, "target/tmp1");
+    }
+  }
+
+  @Test
+  public void testUpdateGlobTargetMissingSingleLevel() {
+
+    try {
+      Path listFile = new Path("target/tmp1/listing").makeQualified(fs.getUri(),
+                                  fs.getWorkingDirectory());
+      addEntries(listFile, "*");
+      createFiles("multifile/file3", "multifile/file4", "multifile/file5");
+      createFiles("singledir/dir2/file6");
+
+      runTest(listFile, target, true);
+
+      checkResult(target, 4, "file3", "file4", "file5", "dir2/file6");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while running distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+      TestDistCpUtils.delete(fs, "target/tmp1");
+    }
+  }
+
+  @Test
+  public void testGlobTargetMissingMultiLevel() {
+
+    try {
+      Path listFile = new Path("target/tmp1/listing").makeQualified(fs.getUri(),
+              fs.getWorkingDirectory());
+      addEntries(listFile, "*/*");
+      createFiles("multifile/file3", "multifile/file4", "multifile/file5");
+      createFiles("singledir1/dir3/file7", "singledir1/dir3/file8",
+          "singledir1/dir3/file9");
+
+      runTest(listFile, target, false);
+
+      checkResult(target, 4, "file3", "file4", "file5",
+          "dir3/file7", "dir3/file8", "dir3/file9");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while running distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+      TestDistCpUtils.delete(fs, "target/tmp1");
+    }
+  }
+
+  @Test
+  public void testUpdateGlobTargetMissingMultiLevel() {
+
+    try {
+      Path listFile = new Path("target/tmp1/listing").makeQualified(fs.getUri(),
+              fs.getWorkingDirectory());
+      addEntries(listFile, "*/*");
+      createFiles("multifile/file3", "multifile/file4", "multifile/file5");
+      createFiles("singledir1/dir3/file7", "singledir1/dir3/file8",
+          "singledir1/dir3/file9");
+
+      runTest(listFile, target, true);
+
+      checkResult(target, 6, "file3", "file4", "file5",
+          "file7", "file8", "file9");
+    } catch (IOException e) {
+      LOG.error("Exception encountered while running distcp", e);
+      Assert.fail("distcp failure");
+    } finally {
+      TestDistCpUtils.delete(fs, root);
+      TestDistCpUtils.delete(fs, "target/tmp1");
+    }
+  }
+
+  private void addEntries(Path listFile, String... entries) throws IOException {
+    OutputStream out = fs.create(listFile);
+    try {
+      for (String entry : entries){
+        out.write((root + "/" + entry).getBytes());
+        out.write("\n".getBytes());
+      }
+    } finally {
+      out.close();
+    }
+  }
+
+  private void createFiles(String... entries) throws IOException {
+    for (String entry : entries){
+      OutputStream out = fs.create(new Path(root + "/" + entry));
+      try {
+        out.write((root + "/" + entry).getBytes());
+        out.write("\n".getBytes());
+      } finally {
+        out.close();
+      }
+    }
+  }
+
+  private void mkdirs(String... entries) throws IOException {
+    for (String entry : entries){
+      fs.mkdirs(new Path(entry));
+    }
+  }
+
+  private void runTest(Path listFile, Path target, boolean sync) throws IOException {
+    DistCpOptions options = new DistCpOptions(listFile, target);
+    options.setSyncFolder(sync);
+    try {
+      new DistCp(getConf(), options).execute();
+    } catch (Exception e) {
+      LOG.error("Exception encountered ", e);
+      throw new IOException(e);
+    }
+  }
+
+  private void checkResult(Path target, int count, String... relPaths) throws IOException {
+    Assert.assertEquals(count, fs.listStatus(target).length);
+    if (relPaths == null || relPaths.length == 0) {
+      Assert.assertTrue(target.toString(), fs.exists(target));
+      return;
+    }
+    for (String relPath : relPaths) {
+      Assert.assertTrue(new Path(target, relPath).toString(), fs.exists(new Path(target, relPath)));
+    }
+  }
+
+}
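
A minimal sketch (not part of this patch; the class name and paths are illustrative) of driving DistCp programmatically with a file-based source listing, the same way runTest() above does:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;

public class DistCpDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.default.name", "file:///");        // copy within the local file system
    conf.set("mapred.job.tracker", "local");        // run the MR job in local mode
    Path listFile = new Path("target/tmp/listing"); // text file with one source path per line
    Path target   = new Path("target/tmp/target");
    DistCpOptions options = new DistCpOptions(listFile, target);
    options.setSyncFolder(true);                    // programmatic equivalent of -update
    new DistCp(conf, options).execute();            // builds the listing and runs the copy job
  }
}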

+ 497 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestOptionsParser.java

@@ -0,0 +1,497 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.tools.DistCpOptions.*;
+import org.apache.hadoop.conf.Configuration;
+
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+public class TestOptionsParser {
+
+  @Test
+  public void testParseIgnoreFailure() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertFalse(options.shouldIgnoreFailures());
+
+    options = OptionsParser.parse(new String[] {
+        "-i",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldIgnoreFailures());
+  }
+
+  @Test
+  public void testParseOverwrite() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertFalse(options.shouldOverwrite());
+
+    options = OptionsParser.parse(new String[] {
+        "-overwrite",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldOverwrite());
+
+    try {
+      OptionsParser.parse(new String[] {
+          "-update",
+          "-overwrite",
+          "hdfs://localhost:8020/source/first",
+          "hdfs://localhost:8020/target/"});
+      Assert.fail("Update and overwrite aren't allowed together");
+    } catch (IllegalArgumentException ignore) {
+    }
+  }
+
+  @Test
+  public void testLogPath() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertNull(options.getLogPath());
+
+    options = OptionsParser.parse(new String[] {
+        "-log",
+        "hdfs://localhost:8020/logs",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getLogPath(), new Path("hdfs://localhost:8020/logs"));
+  }
+
+  @Test
+  public void testParseBlocking() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldBlock());
+
+    options = OptionsParser.parse(new String[] {
+        "-async",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertFalse(options.shouldBlock());
+  }
+
+  @Test
+  public void testParseBandwidth() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getMapBandwidth(), DistCpConstants.DEFAULT_BANDWIDTH_MB);
+
+    options = OptionsParser.parse(new String[] {
+        "-bandwidth",
+        "11",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getMapBandwidth(), 11);
+  }
+
+  @Test
+  public void testParseSkipCRC() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertFalse(options.shouldSkipCRC());
+
+    options = OptionsParser.parse(new String[] {
+        "-update",
+        "-skipcrccheck",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldSyncFolder());
+    Assert.assertTrue(options.shouldSkipCRC());
+  }
+
+  @Test
+  public void testParseAtomicCommit() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertFalse(options.shouldAtomicCommit());
+
+    options = OptionsParser.parse(new String[] {
+        "-atomic",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldAtomicCommit());
+
+    try {
+      OptionsParser.parse(new String[] {
+          "-atomic",
+          "-update",
+          "hdfs://localhost:8020/source/first",
+          "hdfs://localhost:8020/target/"});
+      Assert.fail("Atomic and sync folders were allowed");
+    } catch (IllegalArgumentException ignore) { }
+  }
+
+  @Test
+  public void testParseWorkPath() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertNull(options.getAtomicWorkPath());
+
+    options = OptionsParser.parse(new String[] {
+        "-atomic",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertNull(options.getAtomicWorkPath());
+
+    options = OptionsParser.parse(new String[] {
+        "-atomic",
+        "-tmp",
+        "hdfs://localhost:8020/work",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getAtomicWorkPath(), new Path("hdfs://localhost:8020/work"));
+
+    try {
+      OptionsParser.parse(new String[] {
+          "-tmp",
+          "hdfs://localhost:8020/work",
+          "hdfs://localhost:8020/source/first",
+          "hdfs://localhost:8020/target/"});
+      Assert.fail("work path was allowed without -atomic switch");
+    } catch (IllegalArgumentException ignore) {}
+  }
+
+  @Test
+  public void testParseSyncFolders() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertFalse(options.shouldSyncFolder());
+
+    options = OptionsParser.parse(new String[] {
+        "-update",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldSyncFolder());
+  }
+
+  @Test
+  public void testParseDeleteMissing() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertFalse(options.shouldDeleteMissing());
+
+    options = OptionsParser.parse(new String[] {
+        "-update",
+        "-delete",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldSyncFolder());
+    Assert.assertTrue(options.shouldDeleteMissing());
+
+    options = OptionsParser.parse(new String[] {
+        "-overwrite",
+        "-delete",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldOverwrite());
+    Assert.assertTrue(options.shouldDeleteMissing());
+
+    try {
+      OptionsParser.parse(new String[] {
+          "-atomic",
+          "-delete",
+          "hdfs://localhost:8020/source/first",
+          "hdfs://localhost:8020/target/"});
+      Assert.fail("Atomic and delete folders were allowed");
+    } catch (IllegalArgumentException ignore) { }
+  }
+
+  @Test
+  public void testParseSSLConf() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertNull(options.getSslConfigurationFile());
+
+    options = OptionsParser.parse(new String[] {
+        "-mapredSslConf",
+        "/tmp/ssl-client.xml",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getSslConfigurationFile(), "/tmp/ssl-client.xml");
+  }
+
+  @Test
+  public void testParseMaps() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getMaxMaps(), DistCpConstants.DEFAULT_MAPS);
+
+    options = OptionsParser.parse(new String[] {
+        "-m",
+        "1",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getMaxMaps(), 1);
+
+    try {
+      OptionsParser.parse(new String[] {
+          "-m",
+          "hello",
+          "hdfs://localhost:8020/source/first",
+          "hdfs://localhost:8020/target/"});
+      Assert.fail("Non numberic map parsed");
+    } catch (IllegalArgumentException ignore) { }
+
+    try {
+      OptionsParser.parse(new String[] {
+          "-mapredXslConf",
+          "hdfs://localhost:8020/source/first",
+          "hdfs://localhost:8020/target/"});
+      Assert.fail("Non numberic map parsed");
+    } catch (IllegalArgumentException ignore) { }
+  }
+
+  @Test
+  public void testSourceListing() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "-f",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getSourceFileListing(),
+        new Path("hdfs://localhost:8020/source/first"));
+  }
+
+  @Test
+  public void testSourceListingAndSourcePath() {
+    try {
+      OptionsParser.parse(new String[] {
+          "-f",
+          "hdfs://localhost:8020/source/first",
+          "hdfs://localhost:8020/source/first",
+          "hdfs://localhost:8020/target/"});
+      Assert.fail("Both source listing & source paths allowed");
+    } catch (IllegalArgumentException ignore) {}
+  }
+
+  @Test
+  public void testMissingSourceInfo() {
+    try {
+      OptionsParser.parse(new String[] {
+          "hdfs://localhost:8020/target/"});
+      Assert.fail("Neither source listing not source paths present");
+    } catch (IllegalArgumentException ignore) {}
+  }
+
+  @Test
+  public void testMissingTarget() {
+    try {
+      OptionsParser.parse(new String[] {
+          "-f", "hdfs://localhost:8020/source"});
+      Assert.fail("Missing target allowed");
+    } catch (IllegalArgumentException ignore) {}
+  }
+
+  @Test
+  public void testInvalidArgs() {
+    try {
+      OptionsParser.parse(new String[] {
+          "-m", "-f", "hdfs://localhost:8020/source"});
+      Assert.fail("Missing map value");
+    } catch (IllegalArgumentException ignore) {}
+  }
+
+  @Test
+  public void testToString() {
+    DistCpOptions option = new DistCpOptions(new Path("abc"), new Path("xyz"));
+    String val = "DistCpOptions{atomicCommit=false, syncFolder=false, deleteMissing=false, " +
+        "ignoreFailures=false, maxMaps=20, sslConfigurationFile='null', copyStrategy='uniformsize', " +
+        "sourceFileListing=abc, sourcePaths=null, targetPath=xyz}";
+    Assert.assertEquals(val, option.toString());
+    Assert.assertNotSame(DistCpOptionSwitch.ATOMIC_COMMIT.toString(),
+        DistCpOptionSwitch.ATOMIC_COMMIT.name());
+  }
+
+  @Test
+  public void testCopyStrategy() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "-strategy",
+        "dynamic",
+        "-f",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getCopyStrategy(), "dynamic");
+
+    options = OptionsParser.parse(new String[] {
+        "-f",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getCopyStrategy(), DistCpConstants.UNIFORMSIZE);
+  }
+
+  @Test
+  public void testTargetPath() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "-f",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertEquals(options.getTargetPath(), new Path("hdfs://localhost:8020/target/"));
+  }
+
+  @Test
+  public void testPreserve() {
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "-f",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertFalse(options.shouldPreserve(FileAttribute.BLOCKSIZE));
+    Assert.assertFalse(options.shouldPreserve(FileAttribute.REPLICATION));
+    Assert.assertFalse(options.shouldPreserve(FileAttribute.PERMISSION));
+    Assert.assertFalse(options.shouldPreserve(FileAttribute.USER));
+    Assert.assertFalse(options.shouldPreserve(FileAttribute.GROUP));
+
+    options = OptionsParser.parse(new String[] {
+        "-p",
+        "-f",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.BLOCKSIZE));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.REPLICATION));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.PERMISSION));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.USER));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.GROUP));
+
+    options = OptionsParser.parse(new String[] {
+        "-p",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.BLOCKSIZE));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.REPLICATION));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.PERMISSION));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.USER));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.GROUP));
+
+    options = OptionsParser.parse(new String[] {
+        "-pbr",
+        "-f",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.BLOCKSIZE));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.REPLICATION));
+    Assert.assertFalse(options.shouldPreserve(FileAttribute.PERMISSION));
+    Assert.assertFalse(options.shouldPreserve(FileAttribute.USER));
+    Assert.assertFalse(options.shouldPreserve(FileAttribute.GROUP));
+
+    options = OptionsParser.parse(new String[] {
+        "-pbrgup",
+        "-f",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.BLOCKSIZE));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.REPLICATION));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.PERMISSION));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.USER));
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.GROUP));
+
+    options = OptionsParser.parse(new String[] {
+        "-p",
+        "-f",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    int i = 0;
+    Iterator<FileAttribute> attribIterator = options.preserveAttributes();
+    while (attribIterator.hasNext()) {
+      attribIterator.next();
+      i++;
+    }
+    Assert.assertEquals(i, 5);
+
+    try {
+      OptionsParser.parse(new String[] {
+          "-pabc",
+          "-f",
+          "hdfs://localhost:8020/source/first",
+          "hdfs://localhost:8020/target"});
+      Assert.fail("Invalid preserve attribute");
+    }
+    catch (IllegalArgumentException ignore) {}
+    catch (NoSuchElementException ignore) {}
+
+    options = OptionsParser.parse(new String[] {
+        "-f",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    Assert.assertFalse(options.shouldPreserve(FileAttribute.PERMISSION));
+    options.preserve(FileAttribute.PERMISSION);
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.PERMISSION));
+
+    options.preserve(FileAttribute.PERMISSION);
+    Assert.assertTrue(options.shouldPreserve(FileAttribute.PERMISSION));
+  }
+
+  @Test
+  public void testOptionsSwitchAddToConf() {
+    Configuration conf = new Configuration();
+    Assert.assertNull(conf.get(DistCpOptionSwitch.ATOMIC_COMMIT.getConfigLabel()));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.ATOMIC_COMMIT);
+    Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.ATOMIC_COMMIT.getConfigLabel(), false));
+  }
+
+  @Test
+  public void testOptionsAppendToConf() {
+    Configuration conf = new Configuration();
+    Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false));
+    Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.ATOMIC_COMMIT.getConfigLabel(), false));
+    DistCpOptions options = OptionsParser.parse(new String[] {
+        "-atomic",
+        "-i",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    options.appendToConf(conf);
+    Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false));
+    Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.ATOMIC_COMMIT.getConfigLabel(), false));
+    Assert.assertEquals(conf.getInt(DistCpOptionSwitch.BANDWIDTH.getConfigLabel(), -1),
+        DistCpConstants.DEFAULT_BANDWIDTH_MB);
+
+    conf = new Configuration();
+    Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false));
+    Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.DELETE_MISSING.getConfigLabel(), false));
+    Assert.assertEquals(conf.get(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel()), null);
+    options = OptionsParser.parse(new String[] {
+        "-update",
+        "-delete",
+        "-pu",
+        "-bandwidth",
+        "11",
+        "hdfs://localhost:8020/source/first",
+        "hdfs://localhost:8020/target/"});
+    options.appendToConf(conf);
+    Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false));
+    Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.DELETE_MISSING.getConfigLabel(), false));
+    Assert.assertEquals(conf.get(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel()), "U");
+    Assert.assertEquals(conf.getInt(DistCpOptionSwitch.BANDWIDTH.getConfigLabel(), -1), 11);
+  }
+}
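
A minimal sketch (not part of this patch; the class name is illustrative and the URIs are placeholders) of the command-line path these tests cover: OptionsParser.parse() turns argv into a DistCpOptions instance, and appendToConf() pushes the chosen switches into the job Configuration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.DistCpOptionSwitch;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.OptionsParser;

public class OptionsParserSketch {
  public static void main(String[] args) {
    DistCpOptions options = OptionsParser.parse(new String[] {
        "-update", "-delete", "-pu", "-bandwidth", "11",
        "hdfs://localhost:8020/source/first",
        "hdfs://localhost:8020/target/"});

    Configuration conf = new Configuration();
    options.appendToConf(conf);                     // mirror the parsed switches into the job conf

    System.out.println("sync folder : " + options.shouldSyncFolder());
    System.out.println("bandwidth MB: "
        + conf.getInt(DistCpOptionSwitch.BANDWIDTH.getConfigLabel(), -1));
  }
}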

+ 419 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyCommitter.java

@@ -0,0 +1,419 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+import org.apache.hadoop.mapreduce.task.JobContextImpl;
+import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
+import org.apache.hadoop.tools.CopyListing;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.tools.DistCpOptions;
+import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
+import org.apache.hadoop.tools.GlobbedCopyListing;
+import org.apache.hadoop.tools.util.TestDistCpUtils;
+import org.apache.hadoop.security.Credentials;
+import org.junit.*;
+
+import java.io.IOException;
+import java.util.*;
+
+public class TestCopyCommitter {
+  private static final Log LOG = LogFactory.getLog(TestCopyCommitter.class);
+
+  private static final Random rand = new Random();
+
+  private static final Credentials CREDENTIALS = new Credentials();
+  public static final int PORT = 39737;
+
+
+  private static Configuration config;
+  private static MiniDFSCluster cluster;
+
+  private static Job getJobForClient() throws IOException {
+    Job job = Job.getInstance(new Configuration());
+    job.getConfiguration().set("mapred.job.tracker", "localhost:" + PORT);
+    job.setInputFormatClass(NullInputFormat.class);
+    job.setOutputFormatClass(NullOutputFormat.class);
+    job.setNumReduceTasks(0);
+    return job;
+  }
+
+  @BeforeClass
+  public static void create() throws IOException {
+    config = getJobForClient().getConfiguration();
+    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, 0);
+    cluster = new MiniDFSCluster.Builder(config).numDataNodes(1).format(true)
+                      .build();
+  }
+
+  @AfterClass
+  public static void destroy() {
+    if (cluster != null) {
+      cluster.shutdown();
+    }
+  }
+
+  @Before
+  public void createMetaFolder() {
+    config.set(DistCpConstants.CONF_LABEL_META_FOLDER, "/meta");
+    Path meta = new Path("/meta");
+    try {
+      cluster.getFileSystem().mkdirs(meta);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while creating meta folder", e);
+      Assert.fail("Unable to create meta folder");
+    }
+  }
+
+  @After
+  public void cleanupMetaFolder() {
+    Path meta = new Path("/meta");
+    try {
+      if (cluster.getFileSystem().exists(meta)) {
+        cluster.getFileSystem().delete(meta, true);
+        Assert.fail("Expected meta folder to be deleted");
+      }
+    } catch (IOException e) {
+      LOG.error("Exception encountered while cleaning up folder", e);
+      Assert.fail("Unable to clean up meta folder");
+    }
+  }
+
+  @Test
+  public void testNoCommitAction() {
+    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
+    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(),
+        taskAttemptContext.getTaskAttemptID().getJobID());
+    try {
+      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
+      committer.commitJob(jobContext);
+      Assert.assertEquals(taskAttemptContext.getStatus(), "Commit Successful");
+
+      //Test for idempotent commit
+      committer.commitJob(jobContext);
+      Assert.assertEquals(taskAttemptContext.getStatus(), "Commit Successful");
+    } catch (IOException e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Commit failed");
+    }
+  }
+
+  @Test
+  public void testPreserveStatus() {
+    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
+    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(),
+        taskAttemptContext.getTaskAttemptID().getJobID());
+    Configuration conf = jobContext.getConfiguration();
+
+
+    String sourceBase;
+    String targetBase;
+    FileSystem fs = null;
+    try {
+      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
+      fs = FileSystem.get(conf);
+      FsPermission sourcePerm = new FsPermission((short) 511);
+      FsPermission initialPerm = new FsPermission((short) 448);
+      sourceBase = TestDistCpUtils.createTestSetup(fs, sourcePerm);
+      targetBase = TestDistCpUtils.createTestSetup(fs, initialPerm);
+
+      DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)),
+          new Path("/out"));
+      options.preserve(FileAttribute.PERMISSION);
+      options.appendToConf(conf);
+
+      CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
+      Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
+      listing.buildListing(listingFile, options);
+
+      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
+
+      committer.commitJob(jobContext);
+      if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
+        Assert.fail("Permission don't match");
+      }
+
+      //Test for idempotent commit
+      committer.commitJob(jobContext);
+      if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
+        Assert.fail("Permission don't match");
+      }
+
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing for preserve status", e);
+      Assert.fail("Preserve status failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp1");
+    }
+
+  }
+
+  @Test
+  public void testDeleteMissing() {
+    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
+    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(),
+        taskAttemptContext.getTaskAttemptID().getJobID());
+    Configuration conf = jobContext.getConfiguration();
+
+    String sourceBase;
+    String targetBase;
+    FileSystem fs = null;
+    try {
+      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
+      fs = FileSystem.get(conf);
+      sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
+      targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
+      String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
+      fs.rename(new Path(targetBaseAdd), new Path(targetBase));
+
+      DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)),
+          new Path("/out"));
+      options.setSyncFolder(true);
+      options.setDeleteMissing(true);
+      options.appendToConf(conf);
+
+      CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
+      Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
+      listing.buildListing(listingFile, options);
+
+      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
+      conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);
+
+      committer.commitJob(jobContext);
+      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
+        Assert.fail("Source and target folders are not in sync");
+      }
+      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
+        Assert.fail("Source and target folders are not in sync");
+      }
+
+      //Test for idempotent commit
+      committer.commitJob(jobContext);
+      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
+        Assert.fail("Source and target folders are not in sync");
+      }
+      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
+        Assert.fail("Source and target folders are not in sync");
+      }
+    } catch (Throwable e) {
+      LOG.error("Exception encountered while testing for delete missing", e);
+      Assert.fail("Delete missing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp1");
+      conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
+    }
+  }
+
+  @Test
+  public void testDeleteMissingFlatInterleavedFiles() {
+    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
+    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(),
+        taskAttemptContext.getTaskAttemptID().getJobID());
+    Configuration conf = jobContext.getConfiguration();
+
+
+    String sourceBase;
+    String targetBase;
+    FileSystem fs = null;
+    try {
+      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
+      fs = FileSystem.get(conf);
+      sourceBase = "/tmp1/" + String.valueOf(rand.nextLong());
+      targetBase = "/tmp1/" + String.valueOf(rand.nextLong());
+      TestDistCpUtils.createFile(fs, sourceBase + "/1");
+      TestDistCpUtils.createFile(fs, sourceBase + "/3");
+      TestDistCpUtils.createFile(fs, sourceBase + "/4");
+      TestDistCpUtils.createFile(fs, sourceBase + "/5");
+      TestDistCpUtils.createFile(fs, sourceBase + "/7");
+      TestDistCpUtils.createFile(fs, sourceBase + "/8");
+      TestDistCpUtils.createFile(fs, sourceBase + "/9");
+
+      TestDistCpUtils.createFile(fs, targetBase + "/2");
+      TestDistCpUtils.createFile(fs, targetBase + "/4");
+      TestDistCpUtils.createFile(fs, targetBase + "/5");
+      TestDistCpUtils.createFile(fs, targetBase + "/7");
+      TestDistCpUtils.createFile(fs, targetBase + "/9");
+      TestDistCpUtils.createFile(fs, targetBase + "/A");
+
+      DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), 
+          new Path("/out"));
+      options.setSyncFolder(true);
+      options.setDeleteMissing(true);
+      options.appendToConf(conf);
+
+      CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
+      Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
+      listing.buildListing(listingFile, options);
+
+      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
+      conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);
+
+      committer.commitJob(jobContext);
+      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
+        Assert.fail("Source and target folders are not in sync");
+      }
+      Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4);
+
+      //Test for idempotent commit
+      committer.commitJob(jobContext);
+      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
+        Assert.fail("Source and target folders are not in sync");
+      }
+      Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4);
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing for delete missing", e);
+      Assert.fail("Delete missing failure");
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp1");
+      conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
+    }
+
+  }
+
+  @Test
+  public void testAtomicCommitMissingFinal() {
+    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
+    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(),
+        taskAttemptContext.getTaskAttemptID().getJobID());
+    Configuration conf = jobContext.getConfiguration();
+
+    String workPath = "/tmp1/" + String.valueOf(rand.nextLong());
+    String finalPath = "/tmp1/" + String.valueOf(rand.nextLong());
+    FileSystem fs = null;
+    try {
+      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
+      fs = FileSystem.get(conf);
+      fs.mkdirs(new Path(workPath));
+
+      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, workPath);
+      conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, finalPath);
+      conf.setBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, true);
+
+      Assert.assertTrue(fs.exists(new Path(workPath)));
+      Assert.assertFalse(fs.exists(new Path(finalPath)));
+      committer.commitJob(jobContext);
+      Assert.assertFalse(fs.exists(new Path(workPath)));
+      Assert.assertTrue(fs.exists(new Path(finalPath)));
+
+      //Test for idempotent commit
+      committer.commitJob(jobContext);
+      Assert.assertFalse(fs.exists(new Path(workPath)));
+      Assert.assertTrue(fs.exists(new Path(finalPath)));
+
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing for preserve status", e);
+      Assert.fail("Atomic commit failure");
+    } finally {
+      TestDistCpUtils.delete(fs, workPath);
+      TestDistCpUtils.delete(fs, finalPath);
+    }
+  }
+
+  @Test
+  public void testAtomicCommitExistingFinal() {
+    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
+    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(),
+        taskAttemptContext.getTaskAttemptID().getJobID());
+    Configuration conf = jobContext.getConfiguration();
+
+
+    String workPath = "/tmp1/" + String.valueOf(rand.nextLong());
+    String finalPath = "/tmp1/" + String.valueOf(rand.nextLong());
+    FileSystem fs = null;
+    try {
+      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
+      fs = FileSystem.get(conf);
+      fs.mkdirs(new Path(workPath));
+      fs.mkdirs(new Path(finalPath));
+
+      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, workPath);
+      conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, finalPath);
+      conf.setBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, true);
+
+      Assert.assertTrue(fs.exists(new Path(workPath)));
+      Assert.assertTrue(fs.exists(new Path(finalPath)));
+      try {
+        committer.commitJob(jobContext);
+        Assert.fail("Should not be able to atomic-commit to pre-existing path.");
+      } catch(Exception exception) {
+        Assert.assertTrue(fs.exists(new Path(workPath)));
+        Assert.assertTrue(fs.exists(new Path(finalPath)));
+        LOG.info("Atomic-commit Test pass.");
+      }
+
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing for atomic commit.", e);
+      Assert.fail("Atomic commit failure");
+    } finally {
+      TestDistCpUtils.delete(fs, workPath);
+      TestDistCpUtils.delete(fs, finalPath);
+    }
+  }
+
+  private TaskAttemptContext getTaskAttemptContext(Configuration conf) {
+    return new TaskAttemptContextImpl(conf,
+        new TaskAttemptID("200707121733", 1, TaskType.MAP, 1, 1));
+  }
+
+  private boolean checkDirectoryPermissions(FileSystem fs, String targetBase,
+                                            FsPermission sourcePerm) throws IOException {
+    Path base = new Path(targetBase);
+
+    Stack<Path> stack = new Stack<Path>();
+    stack.push(base);
+    while (!stack.isEmpty()) {
+      Path file = stack.pop();
+      if (!fs.exists(file)) continue;
+      FileStatus[] fStatus = fs.listStatus(file);
+      if (fStatus == null || fStatus.length == 0) continue;
+
+      for (FileStatus status : fStatus) {
+        if (status.isDirectory()) {
+          stack.push(status.getPath());
+          Assert.assertEquals(status.getPermission(), sourcePerm);
+        }
+      }
+    }
+    return true;
+  }
+
+  private static class NullInputFormat extends InputFormat {
+    @Override
+    public List getSplits(JobContext context)
+        throws IOException, InterruptedException {
+      return Collections.EMPTY_LIST;
+    }
+
+    @Override
+    public RecordReader createRecordReader(InputSplit split,
+                                           TaskAttemptContext context)
+        throws IOException, InterruptedException {
+      return null;
+    }
+  }
+}
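
A minimal sketch (not part of this patch; the class name and paths are illustrative) of the configuration contract these committer tests rely on: CopyCommitter reads the work path, the final path, and the atomic flag from the job configuration before commitJob() runs:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.DistCpConstants;

public class CommitterConfSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, "/tmp1/work");    // where the maps wrote data
    conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, "/tmp1/final");  // where commitJob() renames to
    conf.setBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, true);          // request the atomic rename
    System.out.println("atomic copy requested: "
        + conf.getBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, false));
  }
}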

+ 826 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java

@@ -0,0 +1,826 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.security.AccessControlException;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.tools.DistCpOptionSwitch;
+import org.apache.hadoop.tools.DistCpOptions;
+import org.apache.hadoop.tools.StubContext;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.security.PrivilegedAction;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+public class TestCopyMapper {
+  private static final Log LOG = LogFactory.getLog(TestCopyMapper.class);
+  private static List<Path> pathList = new ArrayList<Path>();
+  private static int nFiles = 0;
+  private static final int FILE_SIZE = 1024;
+
+  private static MiniDFSCluster cluster;
+
+  private static final String SOURCE_PATH = "/tmp/source";
+  private static final String TARGET_PATH = "/tmp/target";
+
+  private static Configuration configuration;
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    configuration = getConfigurationForCluster();
+    cluster = new MiniDFSCluster.Builder(configuration)
+                .numDataNodes(1)
+                .format(true)
+                .build();
+  }
+
+  private static Configuration getConfigurationForCluster() throws IOException {
+    Configuration configuration = new Configuration();
+    System.setProperty("test.build.data", "target/tmp/build/TEST_COPY_MAPPER/data");
+    configuration.set("hadoop.log.dir", "target/tmp");
+    LOG.debug("fs.default.name  == " + configuration.get("fs.default.name"));
+    LOG.debug("dfs.http.address == " + configuration.get("dfs.http.address"));
+    return configuration;
+  }
+
+  private static Configuration getConfiguration() throws IOException {
+    Configuration configuration = getConfigurationForCluster();
+    final FileSystem fs = cluster.getFileSystem();
+    Path workPath = new Path(TARGET_PATH)
+            .makeQualified(fs.getUri(), fs.getWorkingDirectory());
+    configuration.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH,
+            workPath.toString());
+    configuration.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH,
+            workPath.toString());
+    configuration.setBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(),
+            false);
+    configuration.setBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(),
+            true);
+    configuration.setBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(),
+            true);
+    configuration.set(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel(),
+            "br");
+    return configuration;
+  }
+
+  private static void createSourceData() throws Exception {
+    mkdirs(SOURCE_PATH + "/1");
+    mkdirs(SOURCE_PATH + "/2");
+    mkdirs(SOURCE_PATH + "/2/3/4");
+    mkdirs(SOURCE_PATH + "/2/3");
+    mkdirs(SOURCE_PATH + "/5");
+    touchFile(SOURCE_PATH + "/5/6");
+    mkdirs(SOURCE_PATH + "/7");
+    mkdirs(SOURCE_PATH + "/7/8");
+    touchFile(SOURCE_PATH + "/7/8/9");
+  }
+
+  private static void mkdirs(String path) throws Exception {
+    FileSystem fileSystem = cluster.getFileSystem();
+    final Path qualifiedPath = new Path(path).makeQualified(fileSystem.getUri(),
+                                              fileSystem.getWorkingDirectory());
+    pathList.add(qualifiedPath);
+    fileSystem.mkdirs(qualifiedPath);
+  }
+
+  private static void touchFile(String path) throws Exception {
+    FileSystem fs;
+    DataOutputStream outputStream = null;
+    try {
+      fs = cluster.getFileSystem();
+      final Path qualifiedPath = new Path(path).makeQualified(fs.getUri(),
+                                                      fs.getWorkingDirectory());
+      final long blockSize = fs.getDefaultBlockSize() * 2;
+      outputStream = fs.create(qualifiedPath, true, 0,
+              (short)(fs.getDefaultReplication()*2),
+              blockSize);
+      outputStream.write(new byte[FILE_SIZE]);
+      pathList.add(qualifiedPath);
+      ++nFiles;
+
+      FileStatus fileStatus = fs.getFileStatus(qualifiedPath);
+      System.out.println(fileStatus.getBlockSize());
+      System.out.println(fileStatus.getReplication());
+    }
+    finally {
+      IOUtils.cleanup(null, outputStream);
+    }
+  }
+
+  @Test
+  public void testRun() {
+    try {
+      deleteState();
+      createSourceData();
+
+      FileSystem fs = cluster.getFileSystem();
+      CopyMapper copyMapper = new CopyMapper();
+      StubContext stubContext = new StubContext(getConfiguration(), null, 0);
+      Mapper<Text, FileStatus, Text, Text>.Context context
+              = stubContext.getContext();
+      copyMapper.setup(context);
+
+      for (Path path: pathList) {
+        copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)),
+                fs.getFileStatus(path), context);
+      }
+
+      // Check that the maps worked.
+      for (Path path : pathList) {
+        final Path targetPath = new Path(path.toString()
+                .replaceAll(SOURCE_PATH, TARGET_PATH));
+        Assert.assertTrue(fs.exists(targetPath));
+        Assert.assertTrue(fs.isFile(targetPath) == fs.isFile(path));
+        Assert.assertEquals(fs.getFileStatus(path).getReplication(),
+                fs.getFileStatus(targetPath).getReplication());
+        Assert.assertEquals(fs.getFileStatus(path).getBlockSize(),
+                fs.getFileStatus(targetPath).getBlockSize());
+        Assert.assertTrue(!fs.isFile(targetPath) ||
+                fs.getFileChecksum(targetPath).equals(
+                        fs.getFileChecksum(path)));
+      }
+
+      Assert.assertEquals(pathList.size(),
+              stubContext.getReporter().getCounter(CopyMapper.Counter.COPY).getValue());
+      Assert.assertEquals(nFiles * FILE_SIZE,
+              stubContext.getReporter().getCounter(CopyMapper.Counter.BYTESCOPIED).getValue());
+
+      testCopyingExistingFiles(fs, copyMapper, context);
+      for (Text value : stubContext.getWriter().values()) {
+        Assert.assertTrue(value.toString() + " is not skipped", value.toString().startsWith("SKIP:"));
+      }
+    }
+    catch (Exception e) {
+      LOG.error("Unexpected exception: ", e);
+      Assert.assertTrue(false);
+    }
+  }
+
+  private void testCopyingExistingFiles(FileSystem fs, CopyMapper copyMapper,
+                                        Mapper<Text, FileStatus, Text, Text>.Context context) {
+
+    try {
+      for (Path path : pathList) {
+        copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)),
+                fs.getFileStatus(path), context);
+      }
+
+      Assert.assertEquals(nFiles,
+              context.getCounter(CopyMapper.Counter.SKIP).getValue());
+    }
+    catch (Exception exception) {
+      Assert.assertTrue("Caught unexpected exception:" + exception.getMessage(),
+              false);
+    }
+  }
+
+  @Test
+  public void testMakeDirFailure() {
+    try {
+      deleteState();
+      createSourceData();
+
+      FileSystem fs = cluster.getFileSystem();
+      CopyMapper copyMapper = new CopyMapper();
+      StubContext stubContext = new StubContext(getConfiguration(), null, 0);
+      Mapper<Text, FileStatus, Text, Text>.Context context
+              = stubContext.getContext();
+
+      Configuration configuration = context.getConfiguration();
+      String workPath = new Path("hftp://localhost:1234/*/*/*/?/")
+              .makeQualified(fs.getUri(), fs.getWorkingDirectory()).toString();
+      configuration.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH,
+              workPath);
+      copyMapper.setup(context);
+
+      copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), pathList.get(0))),
+              fs.getFileStatus(pathList.get(0)), context);
+
+      Assert.assertTrue("There should have been an exception.", false);
+    }
+    catch (Exception ignore) {
+    }
+  }
+
+  @Test
+  public void testIgnoreFailures() {
+    doTestIgnoreFailures(true);
+    doTestIgnoreFailures(false);
+  }
+
+  @Test
+  public void testDirToFile() {
+    try {
+      deleteState();
+      createSourceData();
+
+      FileSystem fs = cluster.getFileSystem();
+      CopyMapper copyMapper = new CopyMapper();
+      StubContext stubContext = new StubContext(getConfiguration(), null, 0);
+      Mapper<Text, FileStatus, Text, Text>.Context context
+              = stubContext.getContext();
+
+      mkdirs(SOURCE_PATH + "/src/file");
+      touchFile(TARGET_PATH + "/src/file");
+      try {
+        copyMapper.setup(context);
+        copyMapper.map(new Text("/src/file"),
+            fs.getFileStatus(new Path(SOURCE_PATH + "/src/file")),
+            context);
+      } catch (IOException e) {
+        Assert.assertTrue(e.getMessage().startsWith("Can't replace"));
+      }
+    } catch (Exception e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Test failed: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void testPreserve() {
+    try {
+      deleteState();
+      createSourceData();
+
+      UserGroupInformation tmpUser = UserGroupInformation.createRemoteUser("guest");
+
+      final CopyMapper copyMapper = new CopyMapper();
+      
+      final Mapper<Text, FileStatus, Text, Text>.Context context =  tmpUser.
+          doAs(new PrivilegedAction<Mapper<Text, FileStatus, Text, Text>.Context>() {
+        @Override
+        public Mapper<Text, FileStatus, Text, Text>.Context run() {
+          try {
+            StubContext stubContext = new StubContext(getConfiguration(), null, 0);
+            return stubContext.getContext();
+          } catch (Exception e) {
+            LOG.error("Exception encountered ", e);
+            throw new RuntimeException(e);
+          }
+        }
+      });
+
+      EnumSet<DistCpOptions.FileAttribute> preserveStatus =
+          EnumSet.allOf(DistCpOptions.FileAttribute.class);
+
+      context.getConfiguration().set(DistCpConstants.CONF_LABEL_PRESERVE_STATUS,
+        DistCpUtils.packAttributes(preserveStatus));
+
+      touchFile(SOURCE_PATH + "/src/file");
+      mkdirs(TARGET_PATH);
+      cluster.getFileSystem().setPermission(new Path(TARGET_PATH), new FsPermission((short)511));
+
+      final FileSystem tmpFS = tmpUser.doAs(new PrivilegedAction<FileSystem>() {
+        @Override
+        public FileSystem run() {
+          try {
+            return FileSystem.get(configuration);
+          } catch (IOException e) {
+            LOG.error("Exception encountered ", e);
+            Assert.fail("Test failed: " + e.getMessage());
+            throw new RuntimeException("Test ought to fail here");
+          }
+        }
+      });
+
+      tmpUser.doAs(new PrivilegedAction<Integer>() {
+        @Override
+        public Integer run() {
+          try {
+            copyMapper.setup(context);
+            copyMapper.map(new Text("/src/file"),
+                tmpFS.getFileStatus(new Path(SOURCE_PATH + "/src/file")),
+                context);
+            Assert.fail("Expected copy to fail");
+          } catch (AccessControlException e) {
+            // Expected: the "guest" user cannot preserve attributes on the target.
+          } catch (Exception e) {
+            throw new RuntimeException(e);
+          }
+          return null;
+        }
+      });
+    } catch (Exception e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Test failed: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void testCopyReadableFiles() {
+    try {
+      deleteState();
+      createSourceData();
+
+      UserGroupInformation tmpUser = UserGroupInformation.createRemoteUser("guest");
+
+      final CopyMapper copyMapper = new CopyMapper();
+
+      final Mapper<Text, FileStatus, Text, Text>.Context context =  tmpUser.
+          doAs(new PrivilegedAction<Mapper<Text, FileStatus, Text, Text>.Context>() {
+        @Override
+        public Mapper<Text, FileStatus, Text, Text>.Context run() {
+          try {
+            StubContext stubContext = new StubContext(getConfiguration(), null, 0);
+            return stubContext.getContext();
+          } catch (Exception e) {
+            LOG.error("Exception encountered ", e);
+            throw new RuntimeException(e);
+          }
+        }
+      });
+
+      touchFile(SOURCE_PATH + "/src/file");
+      mkdirs(TARGET_PATH);
+      cluster.getFileSystem().setPermission(new Path(SOURCE_PATH + "/src/file"),
+          new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ));
+      cluster.getFileSystem().setPermission(new Path(TARGET_PATH), new FsPermission((short)511));
+
+      final FileSystem tmpFS = tmpUser.doAs(new PrivilegedAction<FileSystem>() {
+        @Override
+        public FileSystem run() {
+          try {
+            return FileSystem.get(configuration);
+          } catch (IOException e) {
+            LOG.error("Exception encountered ", e);
+            Assert.fail("Test failed: " + e.getMessage());
+            throw new RuntimeException("Test ought to fail here");
+          }
+        }
+      });
+
+      tmpUser.doAs(new PrivilegedAction<Integer>() {
+        @Override
+        public Integer run() {
+          try {
+            copyMapper.setup(context);
+            copyMapper.map(new Text("/src/file"),
+                tmpFS.getFileStatus(new Path(SOURCE_PATH + "/src/file")),
+                context);
+          } catch (Exception e) {
+            throw new RuntimeException(e);
+          }
+          return null;
+        }
+      });
+    } catch (Exception e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Test failed: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void testSkipCopyNoPerms() {
+    try {
+      deleteState();
+      createSourceData();
+
+      UserGroupInformation tmpUser = UserGroupInformation.createRemoteUser("guest");
+
+      final CopyMapper copyMapper = new CopyMapper();
+
+      final StubContext stubContext =  tmpUser.
+          doAs(new PrivilegedAction<StubContext>() {
+        @Override
+        public StubContext run() {
+          try {
+            return new StubContext(getConfiguration(), null, 0);
+          } catch (Exception e) {
+            LOG.error("Exception encountered ", e);
+            throw new RuntimeException(e);
+          }
+        }
+      });
+
+      final Mapper<Text, FileStatus, Text, Text>.Context context = stubContext.getContext();
+      EnumSet<DistCpOptions.FileAttribute> preserveStatus =
+          EnumSet.allOf(DistCpOptions.FileAttribute.class);
+
+      context.getConfiguration().set(DistCpConstants.CONF_LABEL_PRESERVE_STATUS,
+        DistCpUtils.packAttributes(preserveStatus));
+
+      touchFile(SOURCE_PATH + "/src/file");
+      touchFile(TARGET_PATH + "/src/file");
+      cluster.getFileSystem().setPermission(new Path(SOURCE_PATH + "/src/file"),
+          new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ));
+      cluster.getFileSystem().setPermission(new Path(TARGET_PATH + "/src/file"),
+          new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ));
+
+      final FileSystem tmpFS = tmpUser.doAs(new PrivilegedAction<FileSystem>() {
+        @Override
+        public FileSystem run() {
+          try {
+            return FileSystem.get(configuration);
+          } catch (IOException e) {
+            LOG.error("Exception encountered ", e);
+            Assert.fail("Test failed: " + e.getMessage());
+            throw new RuntimeException("Test ought to fail here");
+          }
+        }
+      });
+
+      tmpUser.doAs(new PrivilegedAction<Integer>() {
+        @Override
+        public Integer run() {
+          try {
+            copyMapper.setup(context);
+            copyMapper.map(new Text("/src/file"),
+                tmpFS.getFileStatus(new Path(SOURCE_PATH + "/src/file")),
+                context);
+            Assert.assertEquals(stubContext.getWriter().values().size(), 1);
+            Assert.assertTrue(stubContext.getWriter().values().get(0).toString().startsWith("SKIP"));
+            Assert.assertTrue(stubContext.getWriter().values().get(0).toString().
+                contains(SOURCE_PATH + "/src/file"));
+          } catch (Exception e) {
+            throw new RuntimeException(e);
+          }
+          return null;
+        }
+      });
+    } catch (Exception e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Test failed: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void testFailCopyWithAccessControlException() {
+    try {
+      deleteState();
+      createSourceData();
+
+      UserGroupInformation tmpUser = UserGroupInformation.createRemoteUser("guest");
+
+      final CopyMapper copyMapper = new CopyMapper();
+
+      final StubContext stubContext =  tmpUser.
+          doAs(new PrivilegedAction<StubContext>() {
+        @Override
+        public StubContext run() {
+          try {
+            return new StubContext(getConfiguration(), null, 0);
+          } catch (Exception e) {
+            LOG.error("Exception encountered ", e);
+            throw new RuntimeException(e);
+          }
+        }
+      });
+
+      EnumSet<DistCpOptions.FileAttribute> preserveStatus =
+          EnumSet.allOf(DistCpOptions.FileAttribute.class);
+
+      final Mapper<Text, FileStatus, Text, Text>.Context context
+              = stubContext.getContext();
+      
+      context.getConfiguration().set(DistCpConstants.CONF_LABEL_PRESERVE_STATUS,
+        DistCpUtils.packAttributes(preserveStatus));
+
+      touchFile(SOURCE_PATH + "/src/file");
+      OutputStream out = cluster.getFileSystem().create(new Path(TARGET_PATH + "/src/file"));
+      out.write("hello world".getBytes());
+      out.close();
+      cluster.getFileSystem().setPermission(new Path(SOURCE_PATH + "/src/file"),
+          new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ));
+      cluster.getFileSystem().setPermission(new Path(TARGET_PATH + "/src/file"),
+          new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ));
+
+      final FileSystem tmpFS = tmpUser.doAs(new PrivilegedAction<FileSystem>() {
+        @Override
+        public FileSystem run() {
+          try {
+            return FileSystem.get(configuration);
+          } catch (IOException e) {
+            LOG.error("Exception encountered ", e);
+            Assert.fail("Test failed: " + e.getMessage());
+            throw new RuntimeException("Test ought to fail here");
+          }
+        }
+      });
+
+      tmpUser.doAs(new PrivilegedAction<Integer>() {
+        @Override
+        public Integer run() {
+          try {
+            copyMapper.setup(context);
+            copyMapper.map(new Text("/src/file"),
+                tmpFS.getFileStatus(new Path(SOURCE_PATH + "/src/file")),
+                context);
+            Assert.fail("Didn't expect the file to be copied");
+          } catch (AccessControlException ignore) {
+          } catch (Exception e) {
+            if (e.getCause() == null || !(e.getCause() instanceof AccessControlException)) {
+              throw new RuntimeException(e);
+            }
+          }
+          return null;
+        }
+      });
+    } catch (Exception e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Test failed: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void testFileToDir() {
+    try {
+      deleteState();
+      createSourceData();
+
+      FileSystem fs = cluster.getFileSystem();
+      CopyMapper copyMapper = new CopyMapper();
+      StubContext stubContext = new StubContext(getConfiguration(), null, 0);
+      Mapper<Text, FileStatus, Text, Text>.Context context
+              = stubContext.getContext();
+
+      touchFile(SOURCE_PATH + "/src/file");
+      mkdirs(TARGET_PATH + "/src/file");
+      try {
+        copyMapper.setup(context);
+        copyMapper.map(new Text("/src/file"),
+            fs.getFileStatus(new Path(SOURCE_PATH + "/src/file")),
+            context);
+      } catch (IOException e) {
+        Assert.assertTrue(e.getMessage().startsWith("Can't replace"));
+      }
+    } catch (Exception e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Test failed: " + e.getMessage());
+    }
+  }
+
+  private void doTestIgnoreFailures(boolean ignoreFailures) {
+    try {
+      deleteState();
+      createSourceData();
+
+      FileSystem fs = cluster.getFileSystem();
+      CopyMapper copyMapper = new CopyMapper();
+      StubContext stubContext = new StubContext(getConfiguration(), null, 0);
+      Mapper<Text, FileStatus, Text, Text>.Context context
+              = stubContext.getContext();
+
+      Configuration configuration = context.getConfiguration();
+      configuration.setBoolean(
+              DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), ignoreFailures);
+      configuration.setBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(),
+              true);
+      configuration.setBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(),
+              true);
+      copyMapper.setup(context);
+
+      for (Path path : pathList) {
+        final FileStatus fileStatus = fs.getFileStatus(path);
+        if (!fileStatus.isDirectory()) {
+          fs.delete(path, true);
+          copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)),
+                  fileStatus, context);
+        }
+      }
+      if (ignoreFailures) {
+        for (Text value : stubContext.getWriter().values()) {
+          Assert.assertTrue(value.toString() + " is not marked as failed", value.toString().startsWith("FAIL:"));
+        }
+      }
+      Assert.assertTrue("There should have been an exception.", ignoreFailures);
+    }
+    catch (Exception e) {
+      LOG.error("Exception encountered: ", e);
+      Assert.assertTrue("Unexpected exception: " + e.getMessage(),
+              !ignoreFailures);
+    }
+  }
+
+  private static void deleteState() throws IOException {
+    pathList.clear();
+    nFiles = 0;
+    cluster.getFileSystem().delete(new Path(SOURCE_PATH), true);
+    cluster.getFileSystem().delete(new Path(TARGET_PATH), true);
+  }
+
+  @Test
+  public void testPreserveBlockSizeAndReplication() {
+    testPreserveBlockSizeAndReplicationImpl(true);
+    testPreserveBlockSizeAndReplicationImpl(false);
+  }
+
+  private void testPreserveBlockSizeAndReplicationImpl(boolean preserve){
+    try {
+
+      deleteState();
+      createSourceData();
+
+      FileSystem fs = cluster.getFileSystem();
+      CopyMapper copyMapper = new CopyMapper();
+      StubContext stubContext = new StubContext(getConfiguration(), null, 0);
+      Mapper<Text, FileStatus, Text, Text>.Context context
+              = stubContext.getContext();
+
+      Configuration configuration = context.getConfiguration();
+      EnumSet<DistCpOptions.FileAttribute> fileAttributes
+              = EnumSet.noneOf(DistCpOptions.FileAttribute.class);
+      if (preserve) {
+        fileAttributes.add(DistCpOptions.FileAttribute.BLOCKSIZE);
+        fileAttributes.add(DistCpOptions.FileAttribute.REPLICATION);
+      }
+      configuration.set(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel(),
+              DistCpUtils.packAttributes(fileAttributes));
+
+      copyMapper.setup(context);
+
+      for (Path path : pathList) {
+        final FileStatus fileStatus = fs.getFileStatus(path);
+        copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)),
+                fileStatus, context);
+      }
+
+      // Check that block-size and replication are preserved only when requested.
+      for (Path path : pathList) {
+        final Path targetPath = new Path(path.toString()
+                .replaceAll(SOURCE_PATH, TARGET_PATH));
+        final FileStatus source = fs.getFileStatus(path);
+        final FileStatus target = fs.getFileStatus(targetPath);
+        if (!source.isDirectory() ) {
+          Assert.assertTrue(preserve ||
+                  source.getBlockSize() != target.getBlockSize());
+          Assert.assertTrue(preserve ||
+                  source.getReplication() != target.getReplication());
+          Assert.assertTrue(!preserve ||
+                  source.getBlockSize() == target.getBlockSize());
+          Assert.assertTrue(!preserve ||
+                  source.getReplication() == target.getReplication());
+        }
+      }
+    }
+    catch (Exception e) {
+      LOG.error("Unexpected exception: ", e);
+      Assert.fail("Unexpected exception: " + e.getMessage());
+    }
+  }
+
+  private static void changeUserGroup(String user, String group)
+          throws IOException {
+    FileSystem fs = cluster.getFileSystem();
+    FsPermission changedPermission = new FsPermission(
+            FsAction.ALL, FsAction.ALL, FsAction.ALL
+    );
+    for (Path path : pathList)
+      if (fs.isFile(path)) {
+        fs.setOwner(path, user, group);
+        fs.setPermission(path, changedPermission);
+      }
+  }
+
+  /**
+   * If a single file is being copied to a location where a file of the same
+   * name already exists, the copy is skipped when the target path is the
+   * parent directory, but the existing file is overwritten when the target
+   * path names the file itself.
+   */
+  @Test
+  public void testSingleFileCopy() {
+    try {
+      deleteState();
+      touchFile(SOURCE_PATH + "/1");
+      Path sourceFilePath = pathList.get(0);
+      Path targetFilePath = new Path(sourceFilePath.toString().replaceAll(
+              SOURCE_PATH, TARGET_PATH));
+      touchFile(targetFilePath.toString());
+
+      FileSystem fs = cluster.getFileSystem();
+      CopyMapper copyMapper = new CopyMapper();
+      StubContext stubContext = new StubContext(getConfiguration(), null, 0);
+      Mapper<Text, FileStatus, Text, Text>.Context context
+              = stubContext.getContext();
+
+      context.getConfiguration().set(
+              DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH,
+              targetFilePath.getParent().toString()); // Parent directory.
+      copyMapper.setup(context);
+
+      final FileStatus sourceFileStatus = fs.getFileStatus(sourceFilePath);
+
+      long before = fs.getFileStatus(targetFilePath).getModificationTime();
+      copyMapper.map(new Text(DistCpUtils.getRelativePath(
+              new Path(SOURCE_PATH), sourceFilePath)), sourceFileStatus, context);
+      long after = fs.getFileStatus(targetFilePath).getModificationTime();
+
+      Assert.assertTrue("File should have been skipped", before == after);
+
+      context.getConfiguration().set(
+              DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH,
+              targetFilePath.toString()); // Specify the file path.
+      copyMapper.setup(context);
+
+      before = fs.getFileStatus(targetFilePath).getModificationTime();
+      try { Thread.sleep(2); } catch (Throwable ignore) {}
+      copyMapper.map(new Text(DistCpUtils.getRelativePath(
+              new Path(SOURCE_PATH), sourceFilePath)), sourceFileStatus, context);
+      after = fs.getFileStatus(targetFilePath).getModificationTime();
+
+      Assert.assertTrue("File should have been overwritten.", before < after);
+
+    } catch (Exception exception) {
+      LOG.error("Unexpected exception: ", exception);
+      Assert.fail("Unexpected exception: " + exception.getMessage());
+    }
+  }
+
+  @Test
+  public void testPreserveUserGroup() {
+    testPreserveUserGroupImpl(true);
+    testPreserveUserGroupImpl(false);
+  }
+
+  private void testPreserveUserGroupImpl(boolean preserve){
+    try {
+
+      deleteState();
+      createSourceData();
+      changeUserGroup("Michael", "Corleone");
+
+      FileSystem fs = cluster.getFileSystem();
+      CopyMapper copyMapper = new CopyMapper();
+      StubContext stubContext = new StubContext(getConfiguration(), null, 0);
+      Mapper<Text, FileStatus, Text, Text>.Context context
+              = stubContext.getContext();
+
+      Configuration configuration = context.getConfiguration();
+      EnumSet<DistCpOptions.FileAttribute> fileAttributes
+              = EnumSet.noneOf(DistCpOptions.FileAttribute.class);
+      if (preserve) {
+        fileAttributes.add(DistCpOptions.FileAttribute.USER);
+        fileAttributes.add(DistCpOptions.FileAttribute.GROUP);
+        fileAttributes.add(DistCpOptions.FileAttribute.PERMISSION);
+      }
+
+      configuration.set(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel(),
+              DistCpUtils.packAttributes(fileAttributes));
+      copyMapper.setup(context);
+
+      for (Path path : pathList) {
+        final FileStatus fileStatus = fs.getFileStatus(path);
+        copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)),
+                fileStatus, context);
+      }
+
+      // Check that the user/group/permission attributes are preserved
+      // only when preservation was requested.
+      for (Path path : pathList) {
+        final Path targetPath = new Path(path.toString()
+                .replaceAll(SOURCE_PATH, TARGET_PATH));
+        final FileStatus source = fs.getFileStatus(path);
+        final FileStatus target = fs.getFileStatus(targetPath);
+        if (!source.isDirectory()) {
+          Assert.assertTrue(!preserve || source.getOwner().equals(target.getOwner()));
+          Assert.assertTrue(!preserve || source.getGroup().equals(target.getGroup()));
+          Assert.assertTrue(!preserve || source.getPermission().equals(target.getPermission()));
+          Assert.assertTrue( preserve || !source.getOwner().equals(target.getOwner()));
+          Assert.assertTrue( preserve || !source.getGroup().equals(target.getGroup()));
+          Assert.assertTrue( preserve || !source.getPermission().equals(target.getPermission()));
+          Assert.assertTrue(source.isDirectory() ||
+                  source.getReplication() != target.getReplication());
+        }
+      }
+    }
+    catch (Exception e) {
+      LOG.error("Unexpected exception: ", e);
+      Assert.fail("Unexpected exception: " + e.getMessage());
+    }
+  }
+}
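
For reference, the cases above all follow one pattern: build a StubContext around a Configuration, run CopyMapper.setup() and map() over the listed paths, then assert on the mapper's counters. A minimal sketch of that pattern, assuming the same MiniDFSCluster scaffolding and helpers used in TestCopyMapper (the helper name copyAndAssertAllCopied is hypothetical, not part of the patch):

    // Sketch only: map every listed path through CopyMapper via a StubContext,
    // then verify the COPY counter matches the number of paths.
    private void copyAndAssertAllCopied(FileSystem fs, Configuration conf,
                                        List<Path> sources) throws Exception {
      CopyMapper copyMapper = new CopyMapper();
      StubContext stubContext = new StubContext(conf, null, 0);
      Mapper<Text, FileStatus, Text, Text>.Context context = stubContext.getContext();
      copyMapper.setup(context);
      for (Path source : sources) {
        copyMapper.map(
            new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), source)),
            fs.getFileStatus(source), context);
      }
      // Every listed path should have been counted as copied exactly once.
      Assert.assertEquals(sources.size(),
          stubContext.getReporter().getCounter(CopyMapper.Counter.COPY).getValue());
    }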

+ 135 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyOutputFormat.java

@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+import org.apache.hadoop.mapreduce.task.JobContextImpl;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.junit.Test;
+import org.junit.Assert;
+
+import java.io.IOException;
+
+public class TestCopyOutputFormat {
+  private static final Log LOG = LogFactory.getLog(TestCopyOutputFormat.class);
+
+  @Test
+  public void testSetCommitDirectory() {
+    try {
+      Job job = Job.getInstance(new Configuration());
+      Assert.assertEquals(null, CopyOutputFormat.getCommitDirectory(job));
+
+      job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, "");
+      Assert.assertEquals(null, CopyOutputFormat.getCommitDirectory(job));
+
+      Path directory = new Path("/tmp/test");
+      CopyOutputFormat.setCommitDirectory(job, directory);
+      Assert.assertEquals(directory, CopyOutputFormat.getCommitDirectory(job));
+      Assert.assertEquals(directory.toString(), job.getConfiguration().
+          get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
+    } catch (IOException e) {
+      LOG.error("Exception encountered while running test", e);
+      Assert.fail("Failed while testing for set Commit Directory");
+    }
+  }
+
+  @Test
+  public void testSetWorkingDirectory() {
+    try {
+      Job job = Job.getInstance(new Configuration());
+      Assert.assertEquals(null, CopyOutputFormat.getWorkingDirectory(job));
+
+      job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, "");
+      Assert.assertEquals(null, CopyOutputFormat.getWorkingDirectory(job));
+
+      Path directory = new Path("/tmp/test");
+      CopyOutputFormat.setWorkingDirectory(job, directory);
+      Assert.assertEquals(directory, CopyOutputFormat.getWorkingDirectory(job));
+      Assert.assertEquals(directory.toString(), job.getConfiguration().
+          get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+    } catch (IOException e) {
+      LOG.error("Exception encountered while running test", e);
+      Assert.fail("Failed while testing for set Working Directory");
+    }
+  }
+
+  @Test
+  public void testGetOutputCommitter() {
+    try {
+      TaskAttemptContext context = new TaskAttemptContextImpl(new Configuration(),
+        new TaskAttemptID("200707121733", 1, TaskType.MAP, 1, 1));
+      context.getConfiguration().set("mapred.output.dir", "/out");
+      Assert.assertTrue(new CopyOutputFormat().getOutputCommitter(context) instanceof CopyCommitter);
+    } catch (IOException e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Unable to get output committer");
+    }
+  }
+
+  @Test
+  public void testCheckOutputSpecs() {
+    try {
+      OutputFormat outputFormat = new CopyOutputFormat();
+      Job job = Job.getInstance(new Configuration());
+      JobID jobID = new JobID("200707121733", 1);
+
+      try {
+        JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
+        outputFormat.checkOutputSpecs(context);
+        Assert.fail("No checking for invalid work/commit path");
+      } catch (IllegalStateException ignore) { }
+
+      CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work"));
+      try {
+        JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
+        outputFormat.checkOutputSpecs(context);
+        Assert.fail("No checking for invalid commit path");
+      } catch (IllegalStateException ignore) { }
+
+      job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, "");
+      CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit"));
+      try {
+        JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
+        outputFormat.checkOutputSpecs(context);
+        Assert.fail("No checking for invalid work path");
+      } catch (IllegalStateException ignore) { }
+
+      CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work"));
+      CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit"));
+      try {
+        JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
+        outputFormat.checkOutputSpecs(context);
+      } catch (IllegalStateException ignore) {
+        Assert.fail("Output spec check failed.");
+      }
+
+    } catch (IOException e) {
+      LOG.error("Exception encountered while testing checkOutputSpecs", e);
+      Assert.fail("checkOutputSpecs failure");
+    } catch (InterruptedException e) {
+      LOG.error("Exception encountered while testing checkOutputSpecs", e);
+      Assert.fail("checkOutputSpecs failure");
+    }
+  }
+}
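
For context, the two static setters exercised above are what a driver would call to point CopyOutputFormat at its working and commit directories. A short sketch of that wiring, assuming a plain MapReduce Job; the /tmp paths are placeholders and Job.setOutputFormatClass is standard MapReduce API, not part of this patch:

    // Sketch: configure a job so CopyOutputFormat.checkOutputSpecs() passes.
    Job job = Job.getInstance(new Configuration());
    CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work"));
    CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit"));
    job.setOutputFormatClass(CopyOutputFormat.class);
    // With both directories set, checkOutputSpecs() no longer throws
    // IllegalStateException (compare testCheckOutputSpecs above).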

+ 254 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestUniformSizeInputFormat.java

@@ -0,0 +1,254 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.task.JobContextImpl;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.tools.CopyListing;
+import org.apache.hadoop.tools.DistCpOptions;
+import org.apache.hadoop.tools.StubContext;
+import org.apache.hadoop.security.Credentials;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+
+public class TestUniformSizeInputFormat {
+  private static final Log LOG
+                = LogFactory.getLog(TestUniformSizeInputFormat.class);
+
+  private static MiniDFSCluster cluster;
+  private static final int N_FILES = 20;
+  private static final int SIZEOF_EACH_FILE = 1024;
+  private static final Random random = new Random();
+  private static int totalFileSize = 0;
+
+  private static final Credentials CREDENTIALS = new Credentials();
+
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    cluster = new MiniDFSCluster.Builder(new Configuration()).numDataNodes(1)
+                                          .format(true).build();
+    totalFileSize = 0;
+
+    for (int i=0; i<N_FILES; ++i)
+      totalFileSize += createFile("/tmp/source/" + String.valueOf(i), SIZEOF_EACH_FILE);
+  }
+
+  private static DistCpOptions getOptions(int nMaps) throws Exception {
+    Path sourcePath = new Path(cluster.getFileSystem().getUri().toString()
+                               + "/tmp/source");
+    Path targetPath = new Path(cluster.getFileSystem().getUri().toString()
+                               + "/tmp/target");
+
+    List<Path> sourceList = new ArrayList<Path>();
+    sourceList.add(sourcePath);
+    final DistCpOptions distCpOptions = new DistCpOptions(sourceList, targetPath);
+    distCpOptions.setMaxMaps(nMaps);
+    return distCpOptions;
+  }
+
+  private static int createFile(String path, int fileSize) throws Exception {
+    FileSystem fileSystem = null;
+    DataOutputStream outputStream = null;
+    try {
+      fileSystem = cluster.getFileSystem();
+      outputStream = fileSystem.create(new Path(path), true, 0);
+      int size = (int) Math.ceil(fileSize + (1 - random.nextFloat()) * fileSize);
+      outputStream.write(new byte[size]);
+      return size;
+    }
+    finally {
+      IOUtils.cleanup(null, fileSystem, outputStream);
+    }
+  }
+
+  @AfterClass
+  public static void tearDown() {
+    cluster.shutdown();
+  }
+
+  public void testGetSplits(int nMaps) throws Exception {
+    DistCpOptions options = getOptions(nMaps);
+    Configuration configuration = new Configuration();
+    configuration.set("mapred.map.tasks",
+                      String.valueOf(options.getMaxMaps()));
+    Path listFile = new Path(cluster.getFileSystem().getUri().toString()
+        + "/tmp/testGetSplits_1/fileList.seq");
+    CopyListing.getCopyListing(configuration, CREDENTIALS, options).
+        buildListing(listFile, options);
+
+    JobContext jobContext = new JobContextImpl(configuration, new JobID());
+    UniformSizeInputFormat uniformSizeInputFormat = new UniformSizeInputFormat();
+    List<InputSplit> splits
+            = uniformSizeInputFormat.getSplits(jobContext);
+
+    List<InputSplit> legacySplits = legacyGetSplits(listFile, nMaps);
+
+    int sizePerMap = totalFileSize/nMaps;
+
+    checkSplits(listFile, splits);
+    checkAgainstLegacy(splits, legacySplits);
+
+    int doubleCheckedTotalSize = 0;
+    int previousSplitSize = -1;
+    for (int i=0; i<splits.size(); ++i) {
+      InputSplit split = splits.get(i);
+      int currentSplitSize = 0;
+      RecordReader<Text, FileStatus> recordReader = uniformSizeInputFormat.createRecordReader(
+              split, null);
+      StubContext stubContext = new StubContext(jobContext.getConfiguration(),
+                                                recordReader, 0);
+      final TaskAttemptContext taskAttemptContext
+         = stubContext.getContext();
+      recordReader.initialize(split, taskAttemptContext);
+      while (recordReader.nextKeyValue()) {
+        Path sourcePath = recordReader.getCurrentValue().getPath();
+        FileSystem fs = sourcePath.getFileSystem(configuration);
+        FileStatus fileStatus [] = fs.listStatus(sourcePath);
+        Assert.assertEquals(fileStatus.length, 1);
+        currentSplitSize += fileStatus[0].getLen();
+      }
+      Assert.assertTrue(
+           previousSplitSize == -1
+               || Math.abs(currentSplitSize - previousSplitSize) < 0.1*sizePerMap
+               || i == splits.size()-1);
+
+      doubleCheckedTotalSize += currentSplitSize;
+    }
+
+    Assert.assertEquals(totalFileSize, doubleCheckedTotalSize);
+  }
+
+  // From
+  // http://svn.apache.org/repos/asf/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/DistCp.java
+  private List<InputSplit> legacyGetSplits(Path listFile, int numSplits)
+      throws IOException {
+
+    FileSystem fs = cluster.getFileSystem();
+    FileStatus srcst = fs.getFileStatus(listFile);
+    Configuration conf = fs.getConf();
+
+    ArrayList<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
+    FileStatus value = new FileStatus();
+    Text key = new Text();
+    final long targetsize = totalFileSize / numSplits;
+    long pos = 0L;
+    long last = 0L;
+    long acc = 0L;
+    long cbrem = srcst.getLen();
+    SequenceFile.Reader sl = null;
+
+    LOG.info("Average bytes per map: " + targetsize +
+        ", Number of maps: " + numSplits + ", total size: " + totalFileSize);
+
+    try {
+      sl = new SequenceFile.Reader(conf, SequenceFile.Reader.file(listFile));
+      for (; sl.next(key, value); last = sl.getPosition()) {
+        // if adding this split would put this split past the target size,
+        // cut the last split and put this next file in the next split.
+        if (acc + value.getLen() > targetsize && acc != 0) {
+          long splitsize = last - pos;
+          FileSplit fileSplit = new FileSplit(listFile, pos, splitsize, null);
+          LOG.info ("Creating split : " + fileSplit + ", bytes in split: " + splitsize);
+          splits.add(fileSplit);
+          cbrem -= splitsize;
+          pos = last;
+          acc = 0L;
+        }
+        acc += value.getLen();
+      }
+    }
+    finally {
+      IOUtils.closeStream(sl);
+    }
+    if (cbrem != 0) {
+      FileSplit fileSplit = new FileSplit(listFile, pos, cbrem, null);
+      LOG.info ("Creating split : " + fileSplit + ", bytes in split: " + cbrem);
+      splits.add(fileSplit);
+    }
+
+    return splits;
+  }
+
+  private void checkSplits(Path listFile, List<InputSplit> splits) throws IOException {
+    long lastEnd = 0;
+
+    // Verify that each split starts exactly where the previous split ended,
+    // so that no part of the listing file is missed.
+    for (InputSplit split : splits) {
+      FileSplit fileSplit = (FileSplit) split;
+      long start = fileSplit.getStart();
+      Assert.assertEquals(lastEnd, start);
+      lastEnd = start + fileSplit.getLength();
+    }
+
+    //Verify there is nothing more to read from the input file
+    SequenceFile.Reader reader
+            = new SequenceFile.Reader(cluster.getFileSystem().getConf(),
+                    SequenceFile.Reader.file(listFile));
+
+    try {
+      reader.seek(lastEnd);
+      FileStatus srcFileStatus = new FileStatus();
+      Text srcRelPath = new Text();
+      Assert.assertFalse(reader.next(srcRelPath, srcFileStatus));
+    } finally {
+      IOUtils.closeStream(reader);
+    }
+  }
+
+  private void checkAgainstLegacy(List<InputSplit> splits,
+                                  List<InputSplit> legacySplits)
+      throws IOException, InterruptedException {
+
+    Assert.assertEquals(legacySplits.size(), splits.size());
+    for (int index = 0; index < splits.size(); index++) {
+      FileSplit fileSplit = (FileSplit) splits.get(index);
+      FileSplit legacyFileSplit = (FileSplit) legacySplits.get(index);
+      Assert.assertEquals(fileSplit.getStart(), legacyFileSplit.getStart());
+    }
+  }
+
+  @Test
+  public void testGetSplits() throws Exception {
+    testGetSplits(9);
+    for (int i=1; i<N_FILES; ++i)
+      testGetSplits(i);
+  }
+}

+ 162 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/lib/TestDynamicInputFormat.java

@@ -0,0 +1,162 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred.lib;
+
+import org.junit.Assert;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.task.JobContextImpl;
+import org.apache.hadoop.tools.CopyListing;
+import org.apache.hadoop.tools.DistCpOptions;
+import org.apache.hadoop.tools.StubContext;
+import org.apache.hadoop.security.Credentials;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.DataOutputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+public class TestDynamicInputFormat {
+  private static final Log LOG = LogFactory.getLog(TestDynamicInputFormat.class);
+  private static MiniDFSCluster cluster;
+  private static final int N_FILES = 1000;
+  private static final int NUM_SPLITS = 7;
+
+  private static final Credentials CREDENTIALS = new Credentials();
+
+  private static List<String> expectedFilePaths = new ArrayList<String>(N_FILES);
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    cluster = new MiniDFSCluster.Builder(getConfigurationForCluster())
+                  .numDataNodes(1).format(true).build();
+
+    for (int i=0; i<N_FILES; ++i)
+      createFile("/tmp/source/" + String.valueOf(i));
+
+  }
+
+  private static Configuration getConfigurationForCluster() {
+    Configuration configuration = new Configuration();
+    System.setProperty("test.build.data",
+                       "target/tmp/build/TEST_DYNAMIC_INPUT_FORMAT/data");
+    configuration.set("hadoop.log.dir", "target/tmp");
+    LOG.debug("fs.default.name  == " + configuration.get("fs.default.name"));
+    LOG.debug("dfs.http.address == " + configuration.get("dfs.http.address"));
+    return configuration;
+  }
+
+  private static DistCpOptions getOptions() throws Exception {
+    Path sourcePath = new Path(cluster.getFileSystem().getUri().toString()
+            + "/tmp/source");
+    Path targetPath = new Path(cluster.getFileSystem().getUri().toString()
+            + "/tmp/target");
+
+    List<Path> sourceList = new ArrayList<Path>();
+    sourceList.add(sourcePath);
+    DistCpOptions options = new DistCpOptions(sourceList, targetPath);
+    options.setMaxMaps(NUM_SPLITS);
+    return options;
+  }
+
+  private static void createFile(String path) throws Exception {
+    FileSystem fileSystem = null;
+    DataOutputStream outputStream = null;
+    try {
+      fileSystem = cluster.getFileSystem();
+      outputStream = fileSystem.create(new Path(path), true, 0);
+      expectedFilePaths.add(fileSystem.listStatus(
+                                    new Path(path))[0].getPath().toString());
+    }
+    finally {
+      IOUtils.cleanup(null, fileSystem, outputStream);
+    }
+  }
+
+  @AfterClass
+  public static void tearDown() {
+    cluster.shutdown();
+  }
+
+  @Test
+  public void testGetSplits() throws Exception {
+    DistCpOptions options = getOptions();
+    Configuration configuration = new Configuration();
+    configuration.set("mapred.map.tasks",
+                      String.valueOf(options.getMaxMaps()));
+    CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(
+            new Path(cluster.getFileSystem().getUri().toString()
+                    +"/tmp/testDynInputFormat/fileList.seq"), options);
+
+    JobContext jobContext = new JobContextImpl(configuration, new JobID());
+    DynamicInputFormat<Text, FileStatus> inputFormat =
+        new DynamicInputFormat<Text, FileStatus>();
+    List<InputSplit> splits = inputFormat.getSplits(jobContext);
+
+    int nFiles = 0;
+    int taskId = 0;
+
+    for (InputSplit split : splits) {
+      RecordReader<Text, FileStatus> recordReader =
+           inputFormat.createRecordReader(split, null);
+      StubContext stubContext = new StubContext(jobContext.getConfiguration(),
+                                                recordReader, taskId);
+      final TaskAttemptContext taskAttemptContext
+         = stubContext.getContext();
+      
+      recordReader.initialize(splits.get(0), taskAttemptContext);
+      float previousProgressValue = 0f;
+      while (recordReader.nextKeyValue()) {
+        FileStatus fileStatus = recordReader.getCurrentValue();
+        String source = fileStatus.getPath().toString();
+        System.out.println(source);
+        Assert.assertTrue(expectedFilePaths.contains(source));
+        final float progress = recordReader.getProgress();
+        Assert.assertTrue(progress >= previousProgressValue);
+        Assert.assertTrue(progress >= 0.0f);
+        Assert.assertTrue(progress <= 1.0f);
+        previousProgressValue = progress;
+        ++nFiles;
+      }
+      Assert.assertTrue(recordReader.getProgress() == 1.0f);
+
+      ++taskId;
+    }
+
+    Assert.assertEquals(expectedFilePaths.size(), nFiles);
+  }
+
+  @Test
+  public void testGetSplitRatio() throws Exception {
+    Assert.assertEquals(1, DynamicInputFormat.getSplitRatio(1, 1000000000));
+    Assert.assertEquals(2, DynamicInputFormat.getSplitRatio(11000000, 10));
+    Assert.assertEquals(4, DynamicInputFormat.getSplitRatio(30, 700));
+    Assert.assertEquals(2, DynamicInputFormat.getSplitRatio(30, 200));
+  }
+}

+ 220 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestDistCpUtils.java

@@ -0,0 +1,220 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.util;
+
+import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.BeforeClass;
+import org.junit.AfterClass;
+
+import java.util.EnumSet;
+import java.util.Random;
+import java.util.Stack;
+import java.io.IOException;
+import java.io.OutputStream;
+
+public class TestDistCpUtils {
+  private static final Log LOG = LogFactory.getLog(TestDistCpUtils.class);
+
+  private static final Configuration config = new Configuration();
+  private static MiniDFSCluster cluster;
+
+  @BeforeClass
+  public static void create() throws IOException {
+    cluster = new MiniDFSCluster.Builder(config).numDataNodes(1).format(true)
+                                                .build(); 
+  }
+
+  @AfterClass
+  public static void destroy() {
+    if (cluster != null) {
+      cluster.shutdown();
+    }
+  }
+
+  @Test
+  public void testGetRelativePathRoot() {
+    Path root = new Path("/tmp/abc");
+    Path child = new Path("/tmp/abc/xyz/file");
+    Assert.assertEquals(DistCpUtils.getRelativePath(root, child), "/xyz/file");
+
+    root = new Path("/");
+    child = new Path("/a");
+    Assert.assertEquals(DistCpUtils.getRelativePath(root, child), "/a");
+  }
+
+  @Test
+  public void testPackAttributes() {
+    EnumSet<FileAttribute> attributes = EnumSet.noneOf(FileAttribute.class);
+    Assert.assertEquals(DistCpUtils.packAttributes(attributes), "");
+
+    attributes.add(FileAttribute.REPLICATION);
+    Assert.assertEquals(DistCpUtils.packAttributes(attributes), "R");
+    Assert.assertEquals(attributes, DistCpUtils.unpackAttributes("R"));
+
+    attributes.add(FileAttribute.BLOCKSIZE);
+    Assert.assertEquals(DistCpUtils.packAttributes(attributes), "RB");
+    Assert.assertEquals(attributes, DistCpUtils.unpackAttributes("RB"));
+
+    attributes.add(FileAttribute.USER);
+    Assert.assertEquals(DistCpUtils.packAttributes(attributes), "RBU");
+    Assert.assertEquals(attributes, DistCpUtils.unpackAttributes("RBU"));
+
+    attributes.add(FileAttribute.GROUP);
+    Assert.assertEquals(DistCpUtils.packAttributes(attributes), "RBUG");
+    Assert.assertEquals(attributes, DistCpUtils.unpackAttributes("RBUG"));
+
+    attributes.add(FileAttribute.PERMISSION);
+    Assert.assertEquals(DistCpUtils.packAttributes(attributes), "RBUGP");
+    Assert.assertEquals(attributes, DistCpUtils.unpackAttributes("RBUGP"));
+  }
+
+  @Test
+  public void testPreserve() {
+    try {
+      FileSystem fs = FileSystem.get(config);
+      EnumSet<FileAttribute> attributes = EnumSet.noneOf(FileAttribute.class);
+
+
+      Path path = new Path("/tmp/abc");
+      Path src = new Path("/tmp/src");
+      fs.mkdirs(path);
+      fs.mkdirs(src);
+      FileStatus srcStatus = fs.getFileStatus(src);
+
+      FsPermission noPerm = new FsPermission((short) 0);
+      fs.setPermission(path, noPerm);
+      fs.setOwner(path, "nobody", "nobody");
+
+      DistCpUtils.preserve(fs, path, srcStatus, attributes);
+      FileStatus target = fs.getFileStatus(path);
+      Assert.assertEquals(target.getPermission(), noPerm);
+      Assert.assertEquals(target.getOwner(), "nobody");
+      Assert.assertEquals(target.getGroup(), "nobody");
+
+      attributes.add(FileAttribute.PERMISSION);
+      DistCpUtils.preserve(fs, path, srcStatus, attributes);
+      target = fs.getFileStatus(path);
+      Assert.assertEquals(target.getPermission(), srcStatus.getPermission());
+      Assert.assertEquals(target.getOwner(), "nobody");
+      Assert.assertEquals(target.getGroup(), "nobody");
+
+      attributes.add(FileAttribute.GROUP);
+      attributes.add(FileAttribute.USER);
+      DistCpUtils.preserve(fs, path, srcStatus, attributes);
+      target = fs.getFileStatus(path);
+      Assert.assertEquals(target.getPermission(), srcStatus.getPermission());
+      Assert.assertEquals(target.getOwner(), srcStatus.getOwner());
+      Assert.assertEquals(target.getGroup(), srcStatus.getGroup());
+
+      fs.delete(path, true);
+      fs.delete(src, true);
+    } catch (IOException e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Preserve test failure");
+    }
+  }
+
+  private static Random rand = new Random();
+
+  public static String createTestSetup(FileSystem fs) throws IOException {
+    return createTestSetup("/tmp1", fs, FsPermission.getDefault());
+  }
+  
+  public static String createTestSetup(FileSystem fs,
+                                       FsPermission perm) throws IOException {
+    return createTestSetup("/tmp1", fs, perm);
+  }
+
+  public static String createTestSetup(String baseDir,
+                                       FileSystem fs,
+                                       FsPermission perm) throws IOException {
+    String base = getBase(baseDir);
+    fs.mkdirs(new Path(base + "/newTest/hello/world1"));
+    fs.mkdirs(new Path(base + "/newTest/hello/world2/newworld"));
+    fs.mkdirs(new Path(base + "/newTest/hello/world3/oldworld"));
+    fs.setPermission(new Path(base + "/newTest"), perm);
+    fs.setPermission(new Path(base + "/newTest/hello"), perm);
+    fs.setPermission(new Path(base + "/newTest/hello/world1"), perm);
+    fs.setPermission(new Path(base + "/newTest/hello/world2"), perm);
+    fs.setPermission(new Path(base + "/newTest/hello/world2/newworld"), perm);
+    fs.setPermission(new Path(base + "/newTest/hello/world3"), perm);
+    fs.setPermission(new Path(base + "/newTest/hello/world3/oldworld"), perm);
+    createFile(fs, base + "/newTest/1");
+    createFile(fs, base + "/newTest/hello/2");
+    createFile(fs, base + "/newTest/hello/world3/oldworld/3");
+    createFile(fs, base + "/newTest/hello/world2/4");
+    return base;
+  }
+
+  private static String getBase(String base) {
+    String location = String.valueOf(rand.nextLong());
+    return base + "/" + location;
+  }
+
+  public static void delete(FileSystem fs, String path) {
+    try {
+      if (fs != null) {
+        if (path != null) {
+          fs.delete(new Path(path), true);
+        }
+      }
+    } catch (IOException e) {
+      LOG.warn("Exception encountered ", e);
+    }
+  }
+
+  public static void createFile(FileSystem fs, String filePath) throws IOException {
+    OutputStream out = fs.create(new Path(filePath));
+    IOUtils.closeStream(out);
+  }
+
+  public static boolean checkIfFoldersAreInSync(FileSystem fs, String targetBase, String sourceBase)
+      throws IOException {
+    Path base = new Path(targetBase);
+
+    Stack<Path> stack = new Stack<Path>();
+    stack.push(base);
+    while (!stack.isEmpty()) {
+      Path file = stack.pop();
+      if (!fs.exists(file)) continue;
+      FileStatus[] fStatus = fs.listStatus(file);
+      if (fStatus == null || fStatus.length == 0) continue;
+
+      for (FileStatus status : fStatus) {
+        if (status.isDirectory()) {
+          stack.push(status.getPath());
+        }
+        Assert.assertTrue(fs.exists(new Path(sourceBase + "/" +
+            DistCpUtils.getRelativePath(new Path(targetBase), status.getPath()))));
+      }
+    }
+    return true;
+  }
+}

+ 81 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestRetriableCommand.java

@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.util;
+
+import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.io.retry.RetryPolicies;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.concurrent.TimeUnit;
+
+public class TestRetriableCommand {
+
+  private static class MyRetriableCommand extends RetriableCommand {
+
+    private int succeedAfter;
+    private int retryCount = 0;
+
+    public MyRetriableCommand(int succeedAfter) {
+      super("MyRetriableCommand");
+      this.succeedAfter = succeedAfter;
+    }
+
+    public MyRetriableCommand(int succeedAfter, RetryPolicy retryPolicy) {
+      super("MyRetriableCommand", retryPolicy);
+      this.succeedAfter = succeedAfter;
+    }
+
+    @Override
+    protected Object doExecute(Object... arguments) throws Exception {
+      if (++retryCount < succeedAfter)
+        throw new Exception("Transient failure#" + retryCount);
+      return 0;
+    }
+  }
+
+  @Test
+  public void testRetriableCommand() {
+    try {
+      new MyRetriableCommand(5).execute(0);
+      Assert.fail("Command should have failed under the default retry-policy");
+    }
+    catch (Exception e) {
+      // Expected: the default retry-policy gives up before the command succeeds.
+    }
+
+    try {
+      new MyRetriableCommand(3).execute(0);
+    }
+    catch (Exception e) {
+      Assert.fail("Command should have succeeded within the default retries");
+    }
+
+    try {
+      new MyRetriableCommand(5, RetryPolicies.
+          retryUpToMaximumCountWithFixedSleep(5, 0, TimeUnit.MILLISECONDS)).execute(0);
+    }
+    catch (Exception e) {
+      Assert.fail("Command should have succeeded with 5 fixed-sleep retries");
+    }
+  }
+}

+ 157 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestThrottledInputStream.java

@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.util;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.IOUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.*;
+
+public class TestThrottledInputStream {
+  private static final Log LOG = LogFactory.getLog(TestThrottledInputStream.class);
+  private static final int BUFF_SIZE = 1024;
+
+  private enum CB {ONE_C, BUFFER, BUFF_OFFSET}
+
+  @Test
+  public void testRead() {
+    File tmpFile;
+    File outFile;
+    try {
+      tmpFile = createFile(1024);
+      outFile = createFile();
+
+      tmpFile.deleteOnExit();
+      outFile.deleteOnExit();
+
+      long maxBandwidth = copyAndAssert(tmpFile, outFile, 0, 1, -1, CB.BUFFER);
+
+      copyAndAssert(tmpFile, outFile, maxBandwidth, 20, 0, CB.BUFFER);
+/*
+      copyAndAssert(tmpFile, outFile, maxBandwidth, 10, 0, CB.BUFFER);
+      copyAndAssert(tmpFile, outFile, maxBandwidth, 50, 0, CB.BUFFER);
+*/
+
+      copyAndAssert(tmpFile, outFile, maxBandwidth, 20, 0, CB.BUFF_OFFSET);
+/*
+      copyAndAssert(tmpFile, outFile, maxBandwidth, 10, 0, CB.BUFF_OFFSET);
+      copyAndAssert(tmpFile, outFile, maxBandwidth, 50, 0, CB.BUFF_OFFSET);
+*/
+
+      copyAndAssert(tmpFile, outFile, maxBandwidth, 20, 0, CB.ONE_C);
+/*
+      copyAndAssert(tmpFile, outFile, maxBandwidth, 10, 0, CB.ONE_C);
+      copyAndAssert(tmpFile, outFile, maxBandwidth, 50, 0, CB.ONE_C);
+*/
+    } catch (IOException e) {
+      LOG.error("Exception encountered ", e);
+      Assert.fail("Throttled-read test failure");
+    }
+  }
+
+  private long copyAndAssert(File tmpFile, File outFile,
+                             long maxBandwidth, float factor,
+                             int sleepTime, CB flag) throws IOException {
+    long bandwidth;
+    ThrottledInputStream in;
+    long maxBPS = (long) (maxBandwidth / factor);
+
+    if (maxBandwidth == 0) {
+      in = new ThrottledInputStream(new FileInputStream(tmpFile));
+    } else {
+      in = new ThrottledInputStream(new FileInputStream(tmpFile), maxBPS);
+    }
+    OutputStream out = new FileOutputStream(outFile);
+    try {
+      if (flag == CB.BUFFER) {
+        copyBytes(in, out, BUFF_SIZE);
+      } else if (flag == CB.BUFF_OFFSET){
+        copyBytesWithOffset(in, out, BUFF_SIZE);
+      } else {
+        copyByteByByte(in, out);
+      }
+
+      LOG.info(in);
+      bandwidth = in.getBytesPerSec();
+      Assert.assertEquals(in.getTotalBytesRead(), tmpFile.length());
+      Assert.assertTrue(in.getBytesPerSec() > maxBandwidth / (factor * 1.2));
+      Assert.assertTrue(in.getTotalSleepTime() >  sleepTime || in.getBytesPerSec() <= maxBPS);
+    } finally {
+      IOUtils.closeStream(in);
+      IOUtils.closeStream(out);
+    }
+    return bandwidth;
+  }
+
+  private static void copyBytesWithOffset(InputStream in, OutputStream out, int buffSize)
+    throws IOException {
+
+    byte buf[] = new byte[buffSize];
+    int bytesRead = in.read(buf, 0, buffSize);
+    while (bytesRead >= 0) {
+      out.write(buf, 0, bytesRead);
+      bytesRead = in.read(buf, 0, buffSize);
+    }
+  }
+
+  private static void copyByteByByte(InputStream in, OutputStream out)
+    throws IOException {
+
+    int ch = in.read();
+    while (ch >= 0) {
+      out.write(ch);
+      ch = in.read();
+    }
+  }
+
+  private static void copyBytes(InputStream in, OutputStream out, int buffSize)
+    throws IOException {
+
+    byte buf[] = new byte[buffSize];
+    int bytesRead = in.read(buf);
+    while (bytesRead >= 0) {
+      out.write(buf, 0, bytesRead);
+      bytesRead = in.read(buf);
+    }
+  }
+
+  private File createFile(long sizeInKB) throws IOException {
+    File tmpFile = createFile();
+    writeToFile(tmpFile, sizeInKB);
+    return tmpFile;
+  }
+
+  private File createFile() throws IOException {
+    return File.createTempFile("tmp", "dat");
+  }
+
+  private void writeToFile(File tmpFile, long sizeInKB) throws IOException {
+    OutputStream out = new FileOutputStream(tmpFile);
+    try {
+      byte[] buffer = new byte [1024];
+      for (long index = 0; index < sizeInKB; index++) {
+        out.write(buffer);
+      }
+    } finally {
+      IOUtils.closeStream(out);
+    }
+  }
+}
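Note: the test above drives ThrottledInputStream through its two constructors and the getBytesPerSec()/getTotalBytesRead()/getTotalSleepTime() counters. A minimal usage sketch, assuming a local file and an illustrative 1 MB/s cap (the class name, file path, and limit below are not part of the patch):

import org.apache.hadoop.tools.util.ThrottledInputStream;

import java.io.FileInputStream;
import java.io.InputStream;

public class ThrottledReadSketch {
  public static void main(String[] args) throws Exception {
    // Wrap any InputStream; reads are throttled to roughly 1 MB/s (hypothetical limit).
    InputStream in = new ThrottledInputStream(
        new FileInputStream("/tmp/source.dat"), 1024 * 1024);
    try {
      byte[] buf = new byte[1024];
      long total = 0;
      int n;
      while ((n = in.read(buf)) >= 0) {
        total += n;
      }
      System.out.println("Read " + total + " bytes");
    } finally {
      in.close();
    }
  }
}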

+ 57 - 0
hadoop-tools/hadoop-distcp/src/test/resources/sslConfig.xml

@@ -0,0 +1,57 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+<property>
+  <name>ssl.client.truststore.location</name>
+  <value>/path/to/truststore/keys/keystore.jks</value>
+  <description>Truststore to be used by clients like distcp. Must be
+  specified.
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.truststore.password</name>
+  <value>changeit</value>
+  <description>Optional. Default value is "".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.truststore.type</name>
+  <value>jks</value>
+  <description>Optional. Default value is "jks".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.location</name>
+  <value>/path/to/keystore/keys/keystore.jks</value>
+  <description>Keystore to be used by clients like distcp. Must be
+  specified.
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.password</name>
+  <value>changeit</value>
+  <description>Optional. Default value is "".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.keypassword</name>
+  <value>changeit</value>
+  <description>Optional. Default value is "".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.type</name>
+  <value>jks</value>
+  <description>Optional. Default value is "jks".
+  </description>
+</property>
+
+</configuration>
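The ssl.client.* keys above follow the standard Hadoop Configuration pattern. A minimal sketch of how a client could read them, assuming the file is on the classpath (only the resource name and property names come from the file above; the class and printout are illustrative):

import org.apache.hadoop.conf.Configuration;

public class SslConfigSketch {
  public static void main(String[] args) {
    // Load only the SSL resource, not core-default/core-site.
    Configuration sslConf = new Configuration(false);
    sslConf.addResource("sslConfig.xml");   // resolved from the classpath

    String trustStore = sslConf.get("ssl.client.truststore.location");
    String trustStoreType = sslConf.get("ssl.client.truststore.type", "jks");
    System.out.println("Client truststore: " + trustStore + " (" + trustStoreType + ")");
  }
}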

+ 1 - 0
hadoop-tools/pom.xml

@@ -29,6 +29,7 @@
 
 
   <modules>
     <module>hadoop-streaming</module>
+    <module>hadoop-distcp</module>
     <module>hadoop-archives</module>
     <module>hadoop-rumen</module>
     <module>hadoop-tools-dist</module>