
Merge r1234388 through r1236385 from 0.23.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23-PB@1236395 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze, 13 years ago
parent
commit
359c746ca7
100 changed files with 6,200 additions and 394 deletions
  1. 11 0
      hadoop-assemblies/src/main/resources/assemblies/hadoop-mapreduce-dist.xml
  2. 18 0
      hadoop-common-project/hadoop-common/CHANGES.txt
  3. 0 5
      hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/commands_manual.xml
  4. 6 0
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/Configuration.java
  5. 8 6
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Path.java
  6. 11 3
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Stat.java
  7. 18 7
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/Decompressor.java
  8. 11 1
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
  9. 0 14
      hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
  10. 6 0
      hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/conf/TestConfiguration.java
  11. 32 0
      hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserFromEnv.java
  12. 2 2
      hadoop-common-project/hadoop-common/src/test/resources/testConf.xml
  13. 1 1
      hadoop-hdfs-project/dev-support/test-patch.properties
  14. 5 0
      hadoop-hdfs-project/hadoop-hdfs-httpfs/pom.xml
  15. 2 2
      hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/server/HttpFSServer.java
  16. 19 0
      hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
  17. 9 11
      hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/content/xdocs/hdfs_user_guide.xml
  18. 9 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
  19. 19 11
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
  20. 9 5
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
  21. 0 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
  22. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/corrupt_files.jsp
  23. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp
  24. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfsnodelist.jsp
  25. 4 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/BenchmarkThroughput.java
  26. 118 2
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java
  27. 15 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java
  28. 25 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java
  29. 0 26
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
  30. 0 4
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameEditsConfigs.java
  31. 0 1
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecondaryWebUi.java
  32. 0 2
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java
  33. 0 2
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java
  34. 56 0
      hadoop-mapreduce-project/CHANGES.txt
  35. 8 13
      hadoop-mapreduce-project/bin/mapred
  36. 3 3
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java
  37. 4 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/pom.xml
  38. 9 6
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapreduce/v2/util/MRApps.java
  39. 18 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/test/java/org/apache/hadoop/mapreduce/v2/util/TestMRApps.java
  40. 4 2
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java
  41. 1 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java
  42. 1 1
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java
  43. 2 1
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java
  44. 17 1
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/EventFetcher.java
  45. 15 2
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/Fetcher.java
  46. 26 12
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/MergeManager.java
  47. 4 15
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/Shuffle.java
  48. 7 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/pom.xml
  49. 0 1
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/main/java/org/apache/hadoop/mapred/ClientServiceDelegate.java
  50. 1 1
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/main/java/org/apache/hadoop/mapred/YARNRunner.java
  51. 1 1
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/GenericMRLoadGenerator.java
  52. 101 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java
  53. 0 1
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/ThreadedMapBenchmark.java
  54. 0 1
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/GenericMRLoadGenerator.java
  55. 757 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/RandomTextWriter.java
  56. 298 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/RandomWriter.java
  57. 111 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestFileInputFormat.java
  58. 0 0
      hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/test/MapredTestDriver.java
  59. 2 2
      hadoop-mapreduce-project/hadoop-mapreduce-examples/pom.xml
  60. 3 2
      hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraGen.java
  61. 1 15
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/ApplicationConstants.java
  62. 4 0
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
  63. 3 0
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/WebApp.java
  64. 14 0
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml
  65. 7 10
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApp.java
  66. 29 1
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueUtils.java
  67. 6 2
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
  68. 59 61
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
  69. 19 18
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java
  70. 1 1
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java
  71. 62 16
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java
  72. 7 7
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java
  73. 52 62
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestParentQueue.java
  74. 31 1
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestQueueParsing.java
  75. 13 10
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesCapacitySched.java
  76. 8 5
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/src/main/java/org/apache/hadoop/yarn/server/webproxy/amfilter/AmIpFilter.java
  77. 121 0
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/src/test/java/org/apache/hadoop/yarn/server/webproxy/amfilter/TestAmFilter.java
  78. 10 10
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ClusterSetup.apt.vm
  79. 49 0
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/WebApplicationProxy.apt.vm
  80. 2 0
      hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/index.apt.vm
  81. 23 0
      hadoop-project/pom.xml
  82. 1 0
      hadoop-project/src/site/site.xml
  83. 7 0
      hadoop-tools/hadoop-distcp/README
  84. 198 0
      hadoop-tools/hadoop-distcp/pom.xml
  85. 218 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyListing.java
  86. 405 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCp.java
  87. 104 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java
  88. 218 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java
  89. 525 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java
  90. 100 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/FileBasedCopyListing.java
  91. 105 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java
  92. 246 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java
  93. 275 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java
  94. 297 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java
  95. 330 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyMapper.java
  96. 124 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyOutputFormat.java
  97. 56 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableDirectoryCreateCommand.java
  98. 245 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java
  99. 169 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/UniformSizeInputFormat.java
  100. 246 0
      hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputChunk.java

+ 11 - 0
hadoop-assemblies/src/main/resources/assemblies/hadoop-mapreduce-dist.xml

@@ -127,6 +127,17 @@
         <unpack>false</unpack>
       </binaries>
     </moduleSet>
+    <moduleSet>
+      <includes>
+        <include>org.apache.hadoop:hadoop-mapreduce-client-jobclient</include>
+      </includes>
+      <binaries>
+        <attachmentClassifier>tests</attachmentClassifier>
+        <outputDirectory>share/hadoop/${hadoop.component}</outputDirectory>
+        <includeDependencies>false</includeDependencies>
+        <unpack>false</unpack>
+      </binaries>
+    </moduleSet>
   </moduleSets>
   <dependencySets>
     <dependencySet>

+ 18 - 0
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -125,6 +125,21 @@ Release 0.23.1 - Unreleased
 
     HADOOP-7975. Add LZ4 as an entry in the default codec list, missed by HADOOP-7657 (harsh)
 
+    HADOOP-7987. Support setting the run-as user in unsecure mode. (jitendra)
+
+    HADOOP-4515. Configuration#getBoolean must not be case sensitive. (Sho Shimauchi via harsh)
+
+    HADOOP-6490. Use StringUtils over String#replace in Path#normalizePath.
+    (Uma Maheswara Rao G via harsh)
+
+    HADOOP-7574. Improve FSShell -stat, add user/group elements.
+    (XieXianshan via harsh)
+
+    HADOOP-7736. Remove duplicate Path#normalizePath call. (harsh)
+
+    HADOOP-7919. Remove the unused hadoop.logfile.* properties from the 
+    core-default.xml file. (harsh)
+
   OPTIMIZATIONS
 
   BUG FIXES
@@ -207,6 +222,9 @@ Release 0.23.1 - Unreleased
    HADOOP-7986. Adding config for MapReduce History Server protocol in
    hadoop-policy.xml for service level authorization. (Mahadev Konar via vinodkv)
 
+   HADOOP-7981. Improve documentation for org.apache.hadoop.io.compress.
+   Decompressor.getRemaining (Jonathan Eagles via mahadev)
+
 Release 0.23.0 - 2011-11-01 
 
   INCOMPATIBLE CHANGES

+ 0 - 5
hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/commands_manual.xml

@@ -753,11 +753,6 @@
 			
 			<section>
 				<title> secondarynamenode </title>
-				<note>
-					The Secondary NameNode has been deprecated. Instead, consider using the
-					<a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Checkpoint+Node">Checkpoint Node</a> or 
-					<a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Backup+Node">Backup Node</a>. 
-				</note>
 				<p>	
 					Runs the HDFS secondary 
 					namenode. See <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Secondary+NameNode">Secondary NameNode</a> 

+ 6 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/Configuration.java

@@ -826,6 +826,12 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
    */
   public boolean getBoolean(String name, boolean defaultValue) {
     String valueString = getTrimmed(name);
+    if (null == valueString || "".equals(valueString)) {
+      return defaultValue;
+    }
+
+    valueString = valueString.toLowerCase();
+
     if ("true".equals(valueString))
       return true;
     else if ("false".equals(valueString))
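
The lower-casing above makes getBoolean() accept any capitalization of "true"/"false", while empty or missing values still fall back to the supplied default (HADOOP-4515). A minimal sketch of the resulting behaviour; the keys used here are hypothetical:

    import org.apache.hadoop.conf.Configuration;

    public class GetBooleanDemo {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("demo.flag", "TRUE");   // hypothetical key; casing no longer matters
        conf.set("demo.empty", "");      // empty value falls back to the default

        System.out.println(conf.getBoolean("demo.flag", false));   // true
        System.out.println(conf.getBoolean("demo.empty", false));  // false (default)
        System.out.println(conf.getBoolean("demo.missing", true)); // true (default)
      }
    }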

+ 8 - 6
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Path.java

@@ -18,10 +18,12 @@
 
 package org.apache.hadoop.fs;
 
-import java.net.*;
-import java.io.*;
-import org.apache.avro.reflect.Stringable;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
 
+import org.apache.avro.reflect.Stringable;
+import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
@@ -76,7 +78,7 @@ public class Path implements Comparable {
     }
     URI resolved = parentUri.resolve(child.uri);
     initialize(resolved.getScheme(), resolved.getAuthority(),
-               normalizePath(resolved.getPath()), resolved.getFragment());
+               resolved.getPath(), resolved.getFragment());
   }
 
   private void checkPathArg( String path ) {
@@ -158,8 +160,8 @@ public class Path implements Comparable {
 
   private String normalizePath(String path) {
     // remove double slashes & backslashes
-    path = path.replace("//", "/");
-    path = path.replace("\\", "/");
+    path = StringUtils.replace(path, "//", "/");
+    path = StringUtils.replace(path, "\\", "/");
     
     // trim trailing slash from non-root path (ignoring windows drive)
     int minLength = hasWindowsDrive(path, true) ? 4 : 1;
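
For context on HADOOP-6490/HADOOP-7736 above: commons-lang StringUtils.replace() is a plain indexOf-based literal replacement, while String#replace(CharSequence, CharSequence) on the JDKs of this era routes through a literal regex Pattern on every call; the result is the same, only the cost differs. A small hedged illustration (the sample path is made up):

    import org.apache.commons.lang.StringUtils;

    public class NormalizeSketch {
      public static void main(String[] args) {
        String raw = "hdfs://nn//user\\data";
        // Collapse double slashes and flip backslashes both ways; the
        // outputs are identical, mirroring the normalizePath() change.
        String viaJdk = raw.replace("//", "/").replace("\\", "/");
        String viaCommons =
            StringUtils.replace(StringUtils.replace(raw, "//", "/"), "\\", "/");
        System.out.println(viaJdk + " " + viaJdk.equals(viaCommons));  // ... true
      }
    }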

+ 11 - 3
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Stat.java

@@ -32,9 +32,11 @@ import org.apache.hadoop.fs.FileStatus;
  * Print statistics about path in specified format.
  * Format sequences:
  *   %b: Size of file in blocks
+ *   %g: Group name of owner
  *   %n: Filename
  *   %o: Block size
  *   %r: replication
+ *   %u: User name of owner
  *   %y: UTC date as &quot;yyyy-MM-dd HH:mm:ss&quot;
  *   %Y: Milliseconds since January 1, 1970 UTC
  */
@@ -50,8 +52,8 @@ class Stat extends FsCommand {
   public static final String USAGE = "[format] <path> ...";
   public static final String DESCRIPTION =
     "Print statistics about the file/directory at <path>\n" +
-    "in the specified format. Format accepts filesize in blocks (%b), filename (%n),\n" +
-    "block size (%o), replication (%r), modification date (%y, %Y)\n";
+    "in the specified format. Format accepts filesize in blocks (%b), group name of owner(%g),\n" +
+    "filename (%n), block size (%o), replication (%r), user name of owner(%u), modification date (%y, %Y)\n";
 
   protected static final SimpleDateFormat timeFmt;
   static {
@@ -92,6 +94,9 @@ class Stat extends FsCommand {
                 ? "directory" 
                 : (stat.isFile() ? "regular file" : "symlink"));
             break;
+          case 'g':
+            buf.append(stat.getGroup());
+            break;
           case 'n':
             buf.append(item.path.getName());
             break;
@@ -101,6 +106,9 @@ class Stat extends FsCommand {
           case 'r':
             buf.append(stat.getReplication());
             break;
+          case 'u':
+            buf.append(stat.getOwner());
+            break;
           case 'y':
             buf.append(timeFmt.format(new Date(stat.getModificationTime())));
             break;
@@ -118,4 +126,4 @@ class Stat extends FsCommand {
     }
     out.println(buf.toString());
   }
-}
+}
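
With the new %g and %u sequences, a stat format string can report ownership alongside the existing fields (HADOOP-7574). A hedged usage sketch that drives FsShell programmatically; the path is hypothetical and the printed owner/group depend on the cluster:

    import org.apache.hadoop.fs.FsShell;
    import org.apache.hadoop.util.ToolRunner;

    public class StatDemo {
      public static void main(String[] args) throws Exception {
        // Prints something like "hdfs supergroup part-00000" for the path.
        ToolRunner.run(new FsShell(),
            new String[] {"-stat", "%u %g %n", "/user/hdfs/part-00000"});
      }
    }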

+ 18 - 7
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/Decompressor.java

@@ -49,7 +49,7 @@ public interface Decompressor {
   public void setInput(byte[] b, int off, int len);
   
   /**
-   * Returns true if the input data buffer is empty and 
+   * Returns <code>true</code> if the input data buffer is empty and 
    * {@link #setInput(byte[], int, int)} should be called to
    * provide more input. 
    * 
@@ -76,8 +76,11 @@ public interface Decompressor {
   public boolean needsDictionary();
 
   /**
-   * Returns true if the end of the decompressed 
-   * data output stream has been reached.
+   * Returns <code>true</code> if the end of the decompressed 
+   * data output stream has been reached. Indicates a concatenated data stream
+   * when finished() returns <code>true</code> and {@link #getRemaining()}
+   * returns a positive value. finished() will be reset with the
+   * {@link #reset()} method.
    * @return <code>true</code> if the end of the decompressed
    * data output stream has been reached.
    */
@@ -98,15 +101,23 @@ public interface Decompressor {
   public int decompress(byte[] b, int off, int len) throws IOException;
 
   /**
-   * Returns the number of bytes remaining in the compressed-data buffer;
-   * typically called after the decompressor has finished decompressing
-   * the current gzip stream (a.k.a. "member").
+   * Returns the number of bytes remaining in the compressed data buffer.
+   * Indicates a concatenated data stream if {@link #finished()} returns
+   * <code>true</code> and getRemaining() returns a positive value. If
+   * {@link #finished()} returns <code>true</code> and getRemaining() returns
+   * a zero value, indicates that the end of data stream has been reached and
+   * is not a concatenated data stream. 
+   * @return The number of bytes remaining in the compressed data buffer.
    */
   public int getRemaining();
 
   /**
    * Resets decompressor and input and output buffers so that a new set of
-   * input data can be processed.
+   * input data can be processed. If {@link #finished()}} returns
+   * <code>true</code> and {@link #getRemaining()} returns a positive value,
+   * reset() is called before processing of the next data stream in the
+   * concatenated data stream. {@link #finished()} will be reset and will
+   * return <code>false</code> when reset() is called.
    */
   public void reset();
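
The clarified contract above suggests a simple caller-side pattern for concatenated streams (for example multi-member gzip): decompress until finished(), and if getRemaining() is still positive, reset() and feed the leftover compressed bytes back in. The sketch below only illustrates that control flow; the way the leftover offset is computed from the original buffer is an assumption for illustration, not Hadoop's own codec code:

    import java.io.IOException;
    import java.io.OutputStream;
    import org.apache.hadoop.io.compress.Decompressor;

    public class ConcatenatedStreamSketch {
      public static void drain(Decompressor decomp, byte[] compressed,
          OutputStream out) throws IOException {
        byte[] buf = new byte[64 * 1024];
        decomp.setInput(compressed, 0, compressed.length);
        while (true) {
          int n = decomp.decompress(buf, 0, buf.length);
          if (n > 0) {
            out.write(buf, 0, n);
          }
          if (decomp.finished()) {
            int remaining = decomp.getRemaining();
            if (remaining == 0) {
              break;  // true end of data, not a concatenated stream
            }
            // finished() plus a positive remainder signals a concatenated
            // stream: reset and hand the unconsumed tail back as new input.
            decomp.reset();
            decomp.setInput(compressed, compressed.length - remaining, remaining);
          } else if (decomp.needsInput()) {
            break;  // no further input available in this sketch
          }
        }
      }
    }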
 

+ 11 - 1
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java

@@ -80,6 +80,7 @@ public class UserGroupInformation {
    * Percentage of the ticket window to use before we renew ticket.
    */
   private static final float TICKET_RENEW_WINDOW = 0.80f;
+  static final String HADOOP_USER_NAME = "HADOOP_USER_NAME";
   
   /** 
    * UgiMetrics maintains UGI activity statistics
@@ -137,7 +138,16 @@ public class UserGroupInformation {
           LOG.debug("using kerberos user:"+user);
         }
       }
-      // if we don't have a kerberos user, use the OS user
+      //If we don't have a kerberos user and security is disabled, check
+      //if user is specified in the environment or properties
+      if (!isSecurityEnabled() && (user == null)) {
+        String envUser = System.getenv(HADOOP_USER_NAME);
+        if (envUser == null) {
+          envUser = System.getProperty(HADOOP_USER_NAME);
+        }
+        user = envUser == null ? null : new User(envUser);
+      }
+      // use the OS user
       if (user == null) {
         user = getCanonicalUser(OS_PRINCIPAL_CLASS);
         if (LOG.isDebugEnabled()) {
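
Combined with the new HADOOP_USER_NAME lookup above, an unsecure client can choose its run-as user through the environment or a system property, which is exactly what the new TestUserFromEnv below exercises. A minimal sketch, assuming security (Kerberos) is disabled; the user name "alice" is arbitrary:

    import java.io.IOException;
    import org.apache.hadoop.security.UserGroupInformation;

    public class RunAsDemo {
      public static void main(String[] args) throws IOException {
        // Same effect as launching the JVM with HADOOP_USER_NAME=alice set
        // in the environment; the property must be set before the first
        // getLoginUser() call, since the login user is cached.
        System.setProperty("HADOOP_USER_NAME", "alice");
        UserGroupInformation ugi = UserGroupInformation.getLoginUser();
        System.out.println(ugi.getUserName());  // alice
      }
    }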

+ 0 - 14
hadoop-common-project/hadoop-common/src/main/resources/core-default.xml

@@ -134,20 +134,6 @@
   </description>
 </property>
 
-<!--- logging properties -->
-
-<property>
-  <name>hadoop.logfile.size</name>
-  <value>10000000</value>
-  <description>The max size of each log file</description>
-</property>
-
-<property>
-  <name>hadoop.logfile.count</name>
-  <value>10</value>
-  <description>The max number of log files</description>
-</property>
-
 <!-- i/o properties -->
 <property>
   <name>io.file.buffer.size</name>

+ 6 - 0
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/conf/TestConfiguration.java

@@ -451,6 +451,9 @@ public class TestConfiguration extends TestCase {
     appendProperty("test.bool3", "  true ");
     appendProperty("test.bool4", " false ");
     appendProperty("test.bool5", "foo");
+    appendProperty("test.bool6", "TRUE");
+    appendProperty("test.bool7", "FALSE");
+    appendProperty("test.bool8", "");
     endConfig();
     Path fileResource = new Path(CONFIG);
     conf.addResource(fileResource);
@@ -459,6 +462,9 @@ public class TestConfiguration extends TestCase {
     assertEquals(true, conf.getBoolean("test.bool3", false));
     assertEquals(false, conf.getBoolean("test.bool4", true));
     assertEquals(true, conf.getBoolean("test.bool5", true));
+    assertEquals(true, conf.getBoolean("test.bool6", false));
+    assertEquals(false, conf.getBoolean("test.bool7", true));
+    assertEquals(false, conf.getBoolean("test.bool8", false));
   }
   
   public void testFloatValues() throws IOException {

+ 32 - 0
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserFromEnv.java

@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.security;
+
+import java.io.IOException;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestUserFromEnv {
+
+  @Test
+  public void testUserFromEnvironment() throws IOException {
+    System.setProperty(UserGroupInformation.HADOOP_USER_NAME, "randomUser");
+    Assert.assertEquals("randomUser", UserGroupInformation.getLoginUser()
+        .getUserName());
+  }
+}

+ 2 - 2
hadoop-common-project/hadoop-common/src/test/resources/testConf.xml

@@ -610,11 +610,11 @@
         </comparator>
         <comparator>
           <type>RegexpComparator</type>
-          <expected-output>^( |\t)*in the specified format. Format accepts filesize in blocks \(%b\), filename \(%n\),( )*</expected-output>
+          <expected-output>^( |\t)*in the specified format. Format accepts filesize in blocks \(%b\), group name of owner\(%g\),( )*</expected-output>
         </comparator>
         <comparator>
           <type>RegexpComparator</type>
-          <expected-output>^( |\t)*block size \(%o\), replication \(%r\), modification date \(%y, %Y\)( )*</expected-output>
+          <expected-output>^( |\t)*filename \(%n\), block size \(%o\), replication \(%r\), user name of owner\(%u\), modification date \(%y, %Y\)( )*</expected-output>
         </comparator>
       </comparators>
     </test>

+ 1 - 1
hadoop-hdfs-project/dev-support/test-patch.properties

@@ -18,4 +18,4 @@
 
 OK_RELEASEAUDIT_WARNINGS=0
 OK_FINDBUGS_WARNINGS=0
-OK_JAVADOC_WARNINGS=2
+OK_JAVADOC_WARNINGS=0

+ 5 - 0
hadoop-hdfs-project/hadoop-hdfs-httpfs/pom.xml

@@ -53,6 +53,11 @@
       <artifactId>mockito-all</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-annotations</artifactId>
+      <scope>provided</scope>
+    </dependency>
     <dependency>
       <groupId>com.sun.jersey</groupId>
       <artifactId>jersey-server</artifactId>

+ 2 - 2
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/server/HttpFSServer.java

@@ -219,7 +219,7 @@ public class HttpFSServer {
    * operation is @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.GetOpValues#LISTSTATUS}
    * @param doAs user being impersonated, defualt value is none. It can be used
    * only if the current user is a HttpFSServer proxyuser.
-   * @param override, default is true. Used only for
+   * @param override default is true. Used only for
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
    * @param blockSize block size to set, used only by
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
@@ -419,7 +419,7 @@ public class HttpFSServer {
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#SETOWNER} operations.
    * @param group group to set, used only for
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#SETOWNER} operations.
-   * @param override, default is true. Used only for
+   * @param override default is true. Used only for
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
    * @param blockSize block size to set, used only by
    * @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.

+ 19 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -201,6 +201,10 @@ Release 0.23.1 - UNRELEASED
 
     HDFS-2817. Combine the two TestSafeMode test suites. (todd)
 
+    HDFS-2818. Fix a missing space issue in HDFS webapps' title tags. (Devaraj K via harsh)
+
+    HDFS-2397. Undeprecate SecondaryNameNode (eli)
+
   OPTIMIZATIONS
 
     HDFS-2130. Switch default checksum to CRC32C. (todd)
@@ -215,6 +219,12 @@ Release 0.23.1 - UNRELEASED
     for a client on the same node as the block file.  (Andrew Purtell,
     Suresh Srinivas and Jitendra Nath Pandey via szetszwo)
 
+    HDFS-2825. Add test hook to turn off the writer preferring its local
+    DN. (todd)
+
+    HDFS-2826. Add test case for HDFS-1476 (safemode can initialize
+    replication queues before exiting) (todd)
+
   BUG FIXES
 
     HDFS-2541. For a sufficiently large value of blocks, the DN Scanner 
@@ -276,6 +286,15 @@ Release 0.23.1 - UNRELEASED
     HDFS-2816. Fix missing license header in httpfs findbugsExcludeFile.xml.
     (hitesh via tucu)
 
+    HDFS-2822. processMisReplicatedBlock incorrectly identifies
+    under-construction blocks as under-replicated. (todd)
+
+    HDFS-442. dfsthroughput in test jar throws NPE (harsh)
+
+    HDFS-2836. HttpFSServer still has 2 javadoc warnings in trunk (revans2 via tucu)
+
+    HDFS-2837. mvn javadoc:javadoc not seeing LimitedPrivate class (revans2 via tucu)
+
 Release 0.23.0 - 2011-11-01 
 
   INCOMPATIBLE CHANGES

+ 9 - 11
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/content/xdocs/hdfs_user_guide.xml

@@ -112,17 +112,18 @@
     		problems.
     	</li>
     	<li>
-    		Secondary NameNode (deprecated): performs periodic checkpoints of the 
+    		Secondary NameNode: performs periodic checkpoints of the 
     		namespace and helps keep the size of file containing log of HDFS 
     		modifications within certain limits at the NameNode.
-    		Replaced by Checkpoint node.
     	</li>
+
     	<li>
     		Checkpoint node: performs periodic checkpoints of the namespace and
     		helps minimize the size of the log stored at the NameNode 
     		containing changes to the HDFS.
-    		Replaces the role previously filled by the Secondary NameNode. 
-    		NameNode allows multiple Checkpoint nodes simultaneously, 
+    		Replaces the role previously filled by the Secondary NameNode,
+                though is not yet battle hardened.
+    		The NameNode allows multiple Checkpoint nodes simultaneously, 
     		as long as there are no Backup nodes registered with the system.
     	</li>
     	<li>
@@ -132,6 +133,7 @@
     		which is always in sync with the active NameNode namespace state.
     		Only one Backup node may be registered with the NameNode at once.
     	</li>
+
       </ul>
     </li>
     </ul>
@@ -234,12 +236,6 @@
    
    </section> 
 	<section> <title>Secondary NameNode</title>
-   <note>
-   The Secondary NameNode has been deprecated. 
-   Instead, consider using the 
-   <a href="hdfs_user_guide.html#Checkpoint+node">Checkpoint Node</a> or 
-   <a href="hdfs_user_guide.html#Backup+node">Backup Node</a>.
-   </note>
    <p>	
      The NameNode stores modifications to the file system as a log
      appended to a native file system file, <code>edits</code>. 
@@ -287,7 +283,9 @@
      <a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#secondarynamenode">secondarynamenode</a>.
    </p>
    
-   </section><section> <title> Checkpoint Node </title>
+   </section>
+
+   <section> <title> Checkpoint Node </title>
    <p>NameNode persists its namespace using two files: <code>fsimage</code>,
       which is the latest checkpoint of the namespace and <code>edits</code>,
       a journal (log) of changes to the namespace since the checkpoint.

+ 9 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

@@ -1793,7 +1793,8 @@ public class BlockManager {
   public void processMisReplicatedBlocks() {
     assert namesystem.hasWriteLock();
 
-    long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0;
+    long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0,
+         nrUnderConstruction = 0;
     neededReplications.clear();
     for (BlockInfo block : blocksMap.getBlocks()) {
       INodeFile fileINode = block.getINode();
@@ -1803,6 +1804,12 @@ public class BlockManager {
         addToInvalidates(block);
         continue;
       }
+      if (!block.isComplete()) {
+        // Incomplete blocks are never considered mis-replicated --
+        // they'll be reached when they are completed or recovered.
+        nrUnderConstruction++;
+        continue;
+      }
       // calculate current replication
       short expectedReplication = fileINode.getReplication();
       NumberReplicas num = countNodes(block);
@@ -1826,6 +1833,7 @@ public class BlockManager {
     LOG.info("Number of invalid blocks          = " + nrInvalid);
     LOG.info("Number of under-replicated blocks = " + nrUnderReplicated);
     LOG.info("Number of  over-replicated blocks = " + nrOverReplicated);
+    LOG.info("Number of blocks being written    = " + nrUnderConstruction);
   }
 
   /** Set replication for the blocks. */

+ 19 - 11
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java

@@ -38,6 +38,8 @@ import org.apache.hadoop.net.NetworkTopology;
 import org.apache.hadoop.net.Node;
 import org.apache.hadoop.net.NodeBase;
 
+import com.google.common.annotations.VisibleForTesting;
+
 /** The class is responsible for choosing the desired number of targets
  * for placing block replicas.
  * The replica placement strategy is that if the writer is on a datanode,
@@ -49,6 +51,7 @@ import org.apache.hadoop.net.NodeBase;
 @InterfaceAudience.Private
 public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
   private boolean considerLoad; 
+  private boolean preferLocalNode = true;
   private NetworkTopology clusterMap;
   private FSClusterStats stats;
   static final String enableDebugLogging = "For more information, please enable"
@@ -223,17 +226,17 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
     if (localMachine == null)
       return chooseRandom(NodeBase.ROOT, excludedNodes, 
                           blocksize, maxNodesPerRack, results);
-      
-    // otherwise try local machine first
-    Node oldNode = excludedNodes.put(localMachine, localMachine);
-    if (oldNode == null) { // was not in the excluded list
-      if (isGoodTarget(localMachine, blocksize,
-                       maxNodesPerRack, false, results)) {
-        results.add(localMachine);
-        return localMachine;
-      }
-    } 
-      
+    if (preferLocalNode) {
+      // otherwise try local machine first
+      Node oldNode = excludedNodes.put(localMachine, localMachine);
+      if (oldNode == null) { // was not in the excluded list
+        if (isGoodTarget(localMachine, blocksize,
+                         maxNodesPerRack, false, results)) {
+          results.add(localMachine);
+          return localMachine;
+        }
+      } 
+    }      
     // try a node on local rack
     return chooseLocalRack(localMachine, excludedNodes, 
                            blocksize, maxNodesPerRack, results);
@@ -568,5 +571,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
     }
     return cur;
   }
+  
+  @VisibleForTesting
+  void setPreferLocalNode(boolean prefer) {
+    this.preferLocalNode = prefer;
+  }
 }
 

+ 9 - 5
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -172,6 +172,7 @@ import org.apache.hadoop.util.VersionInfo;
 import org.mortbay.util.ajax.JSON;
 
 import com.google.common.base.Preconditions;
+import com.google.common.annotations.VisibleForTesting;
 
 /***************************************************
  * FSNamesystem does the actual bookkeeping work for the
@@ -2842,7 +2843,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     /** Total number of blocks. */
     int blockTotal; 
     /** Number of safe blocks. */
-    private int blockSafe;
+    int blockSafe;
     /** Number of blocks needed to satisfy safe mode threshold condition */
     private int blockThreshold;
     /** Number of blocks needed before populating replication queues */
@@ -2850,7 +2851,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     /** time of the last status printout */
     private long lastStatusReport = 0;
     /** flag indicating whether replication queues have been initialized */
-    private boolean initializedReplQueues = false;
+    boolean initializedReplQueues = false;
     /** Was safemode entered automatically because available resources were low. */
     private boolean resourcesLow = false;
     
@@ -2980,9 +2981,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
      */
     private synchronized void initializeReplQueues() {
       LOG.info("initializing replication queues");
-      if (isPopulatingReplQueues()) {
-        LOG.warn("Replication queues already initialized.");
-      }
+      assert !isPopulatingReplQueues() : "Already initialized repl queues";
       long startTimeMisReplicatedScan = now();
       blockManager.processMisReplicatedBlocks();
       initializedReplQueues = true;
@@ -4412,4 +4411,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       byte[] password) throws InvalidToken {
     getDelegationTokenSecretManager().verifyToken(identifier, password);
   }
+
+  @VisibleForTesting
+  public SafeModeInfo getSafeModeInfoForTests() {
+    return safeMode;
+  }
 }

+ 0 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java

@@ -87,7 +87,6 @@ import com.google.common.collect.ImmutableList;
  * primary NameNode.
  *
  **********************************************************/
-@Deprecated // use BackupNode with -checkpoint argument instead.
 @InterfaceAudience.Private
 public class SecondaryNameNode implements Runnable {
     

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/corrupt_files.jsp

@@ -41,7 +41,7 @@
 <!DOCTYPE html>
 <html>
 <link rel="stylesheet" type="text/css" href="/static/hadoop.css">
-<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
+<title>Hadoop <%=namenodeRole%>&nbsp;<%=namenodeLabel%></title>
 <body>
 <h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>
 <%=NamenodeJspHelper.getVersionTable(fsn)%>

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp

@@ -37,7 +37,7 @@
 <html>
 
 <link rel="stylesheet" type="text/css" href="/static/hadoop.css">
-<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
+<title>Hadoop <%=namenodeRole%>&nbsp;<%=namenodeLabel%></title>
     
 <body>
 <h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfsnodelist.jsp

@@ -37,7 +37,7 @@ String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":" + nn.getNameN
 <html>
 
 <link rel="stylesheet" type="text/css" href="/static/hadoop.css">
-<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
+<title>Hadoop <%=namenodeRole%>&nbsp;<%=namenodeLabel%></title>
   
 <body>
 <h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>

+ 4 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/BenchmarkThroughput.java

@@ -193,6 +193,10 @@ public class BenchmarkThroughput extends Configured implements Tool {
     BUFFER_SIZE = conf.getInt("dfsthroughput.buffer.size", 4 * 1024);
 
     String localDir = conf.get("mapred.temp.dir");
+    if (localDir == null) {
+      localDir = conf.get("hadoop.tmp.dir");
+      conf.set("mapred.temp.dir", localDir);
+    }
     dir = new LocalDirAllocator("mapred.temp.dir");
 
     System.setProperty("test.build.data", localDir);

+ 118 - 2
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSafeMode.java

@@ -19,22 +19,37 @@
 package org.apache.hadoop.hdfs;
 
 import java.io.IOException;
+import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.test.GenericTestUtils;
 
 import static org.junit.Assert.*;
 import org.junit.Before;
 import org.junit.After;
 import org.junit.Test;
 
+import com.google.common.base.Supplier;
+import com.google.common.collect.Lists;
+
 /**
  * Tests to verify safe mode correctness.
  */
 public class TestSafeMode {
+  private static final Path TEST_PATH = new Path("/test");
+  private static final int BLOCK_SIZE = 1024;
   Configuration conf; 
   MiniDFSCluster cluster;
   FileSystem fs;
@@ -43,6 +58,7 @@ public class TestSafeMode {
   @Before
   public void startUp() throws IOException {
     conf = new HdfsConfiguration();
+    conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
     cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
     cluster.waitActive();      
     fs = cluster.getFileSystem();
@@ -83,7 +99,7 @@ public class TestSafeMode {
     
     // create two files with one block each.
     DFSTestUtil.createFile(fs, file1, 1000, (short)1, 0);
-    DFSTestUtil.createFile(fs, file2, 2000, (short)1, 0);
+    DFSTestUtil.createFile(fs, file2, 1000, (short)1, 0);
     fs.close();
     cluster.shutdown();
     
@@ -127,6 +143,106 @@ public class TestSafeMode {
     String status = cluster.getNameNode().getNamesystem().getSafemode();
     assertEquals("", status);
   }
+  
+  /**
+   * Test that the NN initializes its under-replicated blocks queue
+   * before it is ready to exit safemode (HDFS-1476)
+   */
+  @Test(timeout=45000)
+  public void testInitializeReplQueuesEarly() throws Exception {
+    // Spray the blocks around the cluster when we add DNs instead of
+    // concentrating all blocks on the first node.
+    BlockManagerTestUtil.setWritingPrefersLocalNode(
+        cluster.getNamesystem().getBlockManager(), false);
+    
+    cluster.startDataNodes(conf, 2, true, StartupOption.REGULAR, null);
+    cluster.waitActive();
+    DFSTestUtil.createFile(fs, TEST_PATH, 15*BLOCK_SIZE, (short)1, 1L);
+    
+    
+    List<DataNodeProperties> dnprops = Lists.newLinkedList();
+    dnprops.add(cluster.stopDataNode(0));
+    dnprops.add(cluster.stopDataNode(0));
+    dnprops.add(cluster.stopDataNode(0));
+    
+    cluster.getConfiguration(0).setFloat(
+        DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 1f/15f);
+    
+    cluster.restartNameNode();
+    final NameNode nn = cluster.getNameNode();
+    
+    String status = nn.getNamesystem().getSafemode();
+    assertEquals("Safe mode is ON.The reported blocks 0 needs additional " +
+        "15 blocks to reach the threshold 0.9990 of total blocks 15. " +
+        "Safe mode will be turned off automatically.", status);
+    assertFalse("Mis-replicated block queues should not be initialized " +
+        "until threshold is crossed",
+        NameNodeAdapter.safeModeInitializedReplQueues(nn));
+    
+    cluster.restartDataNode(dnprops.remove(0));
+
+    // Wait for the block report from the restarted DN to come in.
+    GenericTestUtils.waitFor(new Supplier<Boolean>() {
+      @Override
+      public Boolean get() {
+        return NameNodeAdapter.getSafeModeSafeBlocks(nn) > 0;
+      }
+    }, 10, 10000);
+    // SafeMode is fine-grain synchronized, so the processMisReplicatedBlocks
+    // call is still going on at this point - wait until it's done by grabbing
+    // the lock.
+    nn.getNamesystem().writeLock();
+    nn.getNamesystem().writeUnlock();
+    int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn);
+    assertTrue("Expected first block report to make some but not all blocks " +
+        "safe. Got: " + safe, safe >= 1 && safe < 15);
+    BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager());
+    
+    assertTrue(NameNodeAdapter.safeModeInitializedReplQueues(nn));
+    assertEquals(15 - safe, nn.getNamesystem().getUnderReplicatedBlocks());
+    
+    cluster.restartDataNodes();
+  }
+
+  /**
+   * Test that, when under-replicated blocks are processed at the end of
+   * safe-mode, blocks currently under construction are not considered
+   * under-construction or missing. Regression test for HDFS-2822.
+   */
+  @Test
+  public void testRbwBlocksNotConsideredUnderReplicated() throws IOException {
+    List<FSDataOutputStream> stms = Lists.newArrayList();
+    try {
+      // Create some junk blocks so that the NN doesn't just immediately
+      // exit safemode on restart.
+      DFSTestUtil.createFile(fs, new Path("/junk-blocks"),
+          BLOCK_SIZE*4, (short)1, 1L);
+      // Create several files which are left open. It's important to
+      // create several here, because otherwise the first iteration of the
+      // replication monitor will pull them off the replication queue and
+      // hide this bug from the test!
+      for (int i = 0; i < 10; i++) {
+        FSDataOutputStream stm = fs.create(
+            new Path("/append-" + i), true, BLOCK_SIZE, (short) 1, BLOCK_SIZE);
+        stms.add(stm);
+        stm.write(1);
+        stm.hflush();
+      }
+
+      cluster.restartNameNode();
+      FSNamesystem ns = cluster.getNameNode(0).getNamesystem();
+      BlockManagerTestUtil.updateState(ns.getBlockManager());
+      assertEquals(0, ns.getPendingReplicationBlocks());
+      assertEquals(0, ns.getCorruptReplicaBlocks());
+      assertEquals(0, ns.getMissingBlocksCount());
+
+    } finally {
+      for (FSDataOutputStream stm : stms) {
+        IOUtils.closeStream(stm);
+      }
+      cluster.shutdown();
+    }
+  }
 
   public interface FSRun {
     public abstract void run(FileSystem fs) throws IOException;
@@ -241,4 +357,4 @@ public class TestSafeMode {
     assertEquals("", cluster.getNamesystem().getSafemode());
   }
 
-}
+}

+ 15 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java

@@ -27,6 +27,8 @@ import org.apache.hadoop.hdfs.protocol.Block;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.util.Daemon;
 
+import com.google.common.base.Preconditions;
+
 public class BlockManagerTestUtil {
   public static void setNodeReplicationLimit(final BlockManager blockManager,
       final int limit) {
@@ -122,4 +124,17 @@ public class BlockManagerTestUtil {
     return blockManager.computeDatanodeWork();
   }
   
+  
+  /**
+   * Change whether the block placement policy will prefer the writer's
+   * local Datanode or not.
+   * @param prefer
+   */
+  public static void setWritingPrefersLocalNode(
+      BlockManager bm, boolean prefer) {
+    BlockPlacementPolicy bpp = bm.getBlockPlacementPolicy();
+    Preconditions.checkState(bpp instanceof BlockPlacementPolicyDefault,
+        "Must use default policy, got %s", bpp.getClass());
+    ((BlockPlacementPolicyDefault)bpp).setPreferLocalNode(prefer);
+  }
 }

+ 25 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java

@@ -24,6 +24,7 @@ import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
 import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem.SafeModeInfo;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
 import org.apache.hadoop.ipc.Server;
 
@@ -97,4 +98,28 @@ public class NameNodeAdapter {
       ns.readUnlock();
     }
   }
+  
+  /**
+   * @return the number of blocks marked safe by safemode, or -1
+   * if safemode is not running.
+   */
+  public static int getSafeModeSafeBlocks(NameNode nn) {
+    SafeModeInfo smi = nn.getNamesystem().getSafeModeInfoForTests();
+    if (smi == null) {
+      return -1;
+    }
+    return smi.blockSafe;
+  }
+  
+  /**
+   * @return true if safemode is not running, or if safemode has already
+   * initialized the replication queues
+   */
+  public static boolean safeModeInitializedReplQueues(NameNode nn) {
+    SafeModeInfo smi = nn.getNamesystem().getSafeModeInfoForTests();
+    if (smi == null) {
+      return true;
+    }
+    return smi.initializedReplQueues;
+  }
 }

+ 0 - 26
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java

@@ -203,7 +203,6 @@ public class TestCheckpoint extends TestCase {
   /*
    * Simulate namenode crashing after rolling edit log.
    */
-  @SuppressWarnings("deprecation")
   public void testSecondaryNamenodeError1()
     throws IOException {
     LOG.info("Starting testSecondaryNamenodeError1");
@@ -265,7 +264,6 @@ public class TestCheckpoint extends TestCase {
   /*
    * Simulate a namenode crash after uploading new image
    */
-  @SuppressWarnings("deprecation")
   public void testSecondaryNamenodeError2() throws IOException {
     LOG.info("Starting testSecondaryNamenodeError2");
     Configuration conf = new HdfsConfiguration();
@@ -324,7 +322,6 @@ public class TestCheckpoint extends TestCase {
   /*
    * Simulate a secondary namenode crash after rolling the edit log.
    */
-  @SuppressWarnings("deprecation")
   public void testSecondaryNamenodeError3() throws IOException {
     LOG.info("Starting testSecondaryNamenodeError3");
     Configuration conf = new HdfsConfiguration();
@@ -394,7 +391,6 @@ public class TestCheckpoint extends TestCase {
    * back to the name-node.
    * Used to truncate primary fsimage file.
    */
-  @SuppressWarnings("deprecation")
   public void testSecondaryFailsToReturnImage() throws IOException {
     LOG.info("Starting testSecondaryFailsToReturnImage");
     Configuration conf = new HdfsConfiguration();
@@ -471,7 +467,6 @@ public class TestCheckpoint extends TestCase {
    * @param errorType the ErrorSimulator type to trigger
    * @param exceptionSubstring an expected substring of the triggered exception
    */
-  @SuppressWarnings("deprecation")
   private void doSendFailTest(int errorType, String exceptionSubstring)
       throws IOException {
     Configuration conf = new HdfsConfiguration();
@@ -586,7 +581,6 @@ public class TestCheckpoint extends TestCase {
   /**
    * Test that the SecondaryNameNode properly locks its storage directories.
    */
-  @SuppressWarnings("deprecation")
   public void testSecondaryNameNodeLocking() throws Exception {
     // Start a primary NN so that the secondary will start successfully
     Configuration conf = new HdfsConfiguration();
@@ -679,7 +673,6 @@ public class TestCheckpoint extends TestCase {
    * 2. if the NN does not contain an image, importing a checkpoint
    *    succeeds and re-saves the image
    */
-  @SuppressWarnings("deprecation")
   public void testImportCheckpoint() throws Exception {
     Configuration conf = new HdfsConfiguration();
     Path testPath = new Path("/testfile");
@@ -760,16 +753,12 @@ public class TestCheckpoint extends TestCase {
       throw new IOException("Cannot create directory " + dir);
   }
   
-  // This deprecation suppress warning does not work due to known Java bug:
-  // http://bugs.sun.com/view_bug.do?bug_id=6460147
-  @SuppressWarnings("deprecation")
   SecondaryNameNode startSecondaryNameNode(Configuration conf
                                           ) throws IOException {
     conf.set(DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, "0.0.0.0:0");
     return new SecondaryNameNode(conf);
   }
   
-  @SuppressWarnings("deprecation")
   SecondaryNameNode startSecondaryNameNode(Configuration conf, int index)
       throws IOException {
     Configuration snnConf = new Configuration(conf);
@@ -782,7 +771,6 @@ public class TestCheckpoint extends TestCase {
   /**
    * Tests checkpoint in HDFS.
    */
-  @SuppressWarnings("deprecation")
   public void testCheckpoint() throws IOException {
     Path file1 = new Path("checkpoint.dat");
     Path file2 = new Path("checkpoint2.dat");
@@ -1009,7 +997,6 @@ public class TestCheckpoint extends TestCase {
    * - it then fails again for the same reason
    * - it then tries to checkpoint a third time
    */
-  @SuppressWarnings("deprecation")
   public void testCheckpointAfterTwoFailedUploads() throws IOException {
     MiniDFSCluster cluster = null;
     SecondaryNameNode secondary = null;
@@ -1064,7 +1051,6 @@ public class TestCheckpoint extends TestCase {
    * 
    * @throws IOException
    */
-  @SuppressWarnings("deprecation")
   public void testMultipleSecondaryNamenodes() throws IOException {
     Configuration conf = new HdfsConfiguration();
     String nameserviceId1 = "ns1";
@@ -1114,7 +1100,6 @@ public class TestCheckpoint extends TestCase {
    * Test that the secondary doesn't have to re-download image
    * if it hasn't changed.
    */
-  @SuppressWarnings("deprecation")
   public void testSecondaryImageDownload() throws IOException {
     LOG.info("Starting testSecondaryImageDownload");
     Configuration conf = new HdfsConfiguration();
@@ -1197,7 +1182,6 @@ public class TestCheckpoint extends TestCase {
    * It verifies that this works even though the earlier-txid checkpoint gets
    * uploaded after the later-txid checkpoint.
    */
-  @SuppressWarnings("deprecation")
   public void testMultipleSecondaryNNsAgainstSameNN() throws Exception {
     Configuration conf = new HdfsConfiguration();
 
@@ -1283,7 +1267,6 @@ public class TestCheckpoint extends TestCase {
    * It verifies that one of the two gets an error that it's uploading a
    * duplicate checkpoint, and the other one succeeds.
    */
-  @SuppressWarnings("deprecation")
   public void testMultipleSecondaryNNsAgainstSameNN2() throws Exception {
     Configuration conf = new HdfsConfiguration();
 
@@ -1382,7 +1365,6 @@ public class TestCheckpoint extends TestCase {
    * is running. The secondary should shut itself down if if talks to a NN
    * with the wrong namespace.
    */
-  @SuppressWarnings("deprecation")
   public void testReformatNNBetweenCheckpoints() throws IOException {
     MiniDFSCluster cluster = null;
     SecondaryNameNode secondary = null;
@@ -1637,7 +1619,6 @@ public class TestCheckpoint extends TestCase {
   /**
    * Test that the 2NN triggers a checkpoint after the configurable interval
    */
-  @SuppressWarnings("deprecation")
   public void testCheckpointTriggerOnTxnCount() throws Exception {
     MiniDFSCluster cluster = null;
     SecondaryNameNode secondary = null;
@@ -1691,7 +1672,6 @@ public class TestCheckpoint extends TestCase {
    * logs that connect the 2NN's old checkpoint to the current txid
    * get archived. Then, the 2NN tries to checkpoint again.
    */
-  @SuppressWarnings("deprecation")
   public void testSecondaryHasVeryOutOfDateImage() throws IOException {
     MiniDFSCluster cluster = null;
     SecondaryNameNode secondary = null;
@@ -1729,7 +1709,6 @@ public class TestCheckpoint extends TestCase {
     }
   }
   
-  @SuppressWarnings("deprecation")
   public void testCommandLineParsing() throws ParseException {
     SecondaryNameNode.CommandLineOpts opts =
       new SecondaryNameNode.CommandLineOpts();
@@ -1764,7 +1743,6 @@ public class TestCheckpoint extends TestCase {
     } catch (ParseException e) {}
   }
 
-  @SuppressWarnings("deprecation")
   private void cleanup(SecondaryNameNode snn) {
     if (snn != null) {
       try {
@@ -1780,7 +1758,6 @@ public class TestCheckpoint extends TestCase {
    * Assert that if any two files have the same name across the 2NNs
    * and NN, they should have the same content too.
    */
-  @SuppressWarnings("deprecation")
   private void assertParallelFilesInvariant(MiniDFSCluster cluster,
       ImmutableList<SecondaryNameNode> secondaries) throws Exception {
     List<File> allCurrentDirs = Lists.newArrayList();
@@ -1792,7 +1769,6 @@ public class TestCheckpoint extends TestCase {
         ImmutableSet.of("VERSION"));    
   }
   
-  @SuppressWarnings("deprecation")
   private List<File> getCheckpointCurrentDirs(SecondaryNameNode secondary) {
     List<File> ret = Lists.newArrayList();
     for (URI u : secondary.getCheckpointDirs()) {
@@ -1802,7 +1778,6 @@ public class TestCheckpoint extends TestCase {
     return ret;
   }
 
-  @SuppressWarnings("deprecation")
   private CheckpointStorage spyOnSecondaryImage(SecondaryNameNode secondary1) {
     CheckpointStorage spy = Mockito.spy((CheckpointStorage)secondary1.getFSImage());
     secondary1.setFSImage(spy);
@@ -1812,7 +1787,6 @@ public class TestCheckpoint extends TestCase {
   /**
    * A utility class to perform a checkpoint in a different thread.
    */
-  @SuppressWarnings("deprecation")
   private static class DoCheckpointThread extends Thread {
     private final SecondaryNameNode snn;
     private volatile Throwable thrown = null;

+ 0 - 4
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameEditsConfigs.java

@@ -106,9 +106,6 @@ public class TestNameEditsConfigs extends TestCase {
     assertTrue(!fileSys.exists(name));
   }
 
-  // This deprecation suppress warning does not work due to known Java bug:
-  // http://bugs.sun.com/view_bug.do?bug_id=6460147
-  @SuppressWarnings("deprecation")
   SecondaryNameNode startSecondaryNameNode(Configuration conf
                                           ) throws IOException {
     conf.set(DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, "0.0.0.0:0");
@@ -128,7 +125,6 @@ public class TestNameEditsConfigs extends TestCase {
    * sure we are reading proper edits and image.
    * @throws Exception 
    */
-  @SuppressWarnings("deprecation")
   public void testNameEditsConfigs() throws Exception {
     Path file1 = new Path("TestNameEditsConfigs1");
     Path file2 = new Path("TestNameEditsConfigs2");

+ 0 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecondaryWebUi.java

@@ -30,7 +30,6 @@ import org.junit.Test;
 
 public class TestSecondaryWebUi {
 
-  @SuppressWarnings("deprecation")
   @Test
   public void testSecondaryWebUi() throws IOException {
     Configuration conf = new Configuration();

+ 0 - 2
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java

@@ -120,7 +120,6 @@ public class TestStartup extends TestCase {
    * start MiniDFSCluster, create a file (to create edits) and do a checkpoint
    * @throws IOException
    */
-  @SuppressWarnings("deprecation")
   public void createCheckPoint() throws IOException {
     LOG.info("--starting mini cluster");
     // manage dirs parameter set to false 
@@ -300,7 +299,6 @@ public class TestStartup extends TestCase {
    * secondary node copies fsimage and edits into correct separate directories.
    * @throws IOException
    */
-  @SuppressWarnings("deprecation")
   public void testSNNStartup() throws IOException{
     //setUpConfig();
     LOG.info("--starting SecondNN startup test");

+ 0 - 2
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java

@@ -153,7 +153,6 @@ public class TestStorageRestore {
    * 7. run doCheckpoint
    * 8. verify that all the image and edits files are the same.
    */
-  @SuppressWarnings("deprecation")
   @Test
   public void testStorageRestore() throws Exception {
     int numDatanodes = 0;
@@ -310,7 +309,6 @@ public class TestStorageRestore {
    * then try to perform a checkpoint. The NN should not serve up the image or
    * edits from the restored (empty) dir.
    */
-  @SuppressWarnings("deprecation")
   @Test
   public void testMultipleSecondaryCheckpoint() throws IOException {
     

+ 56 - 0
hadoop-mapreduce-project/CHANGES.txt

@@ -142,6 +142,14 @@ Release 0.23.1 - Unreleased
 
     MAPREDUCE-3692. yarn-resourcemanager out and log files can get big. (eli)
 
+    MAPREDUCE-3710. Improved FileInputFormat to return better locality for the
+    last split. (Siddarth Seth via vinodkv)
+
+    MAPREDUCE-2765. DistCp Rewrite. (Mithun Radhakrishnan via mahadev)
+
+    MAPREDUCE-3737. The Web Application Proxy is not documented very well.
+    (Robert Evans via mahadev)
+
   OPTIMIZATIONS
 
     MAPREDUCE-3567. Extraneous JobConf objects in AM heap. (Vinod Kumar
@@ -165,7 +173,13 @@ Release 0.23.1 - Unreleased
     MAPREDUCE-3512. Batching JobHistory flushing to DFS so that we don't flush
     for every event slowing down AM. (Siddarth Seth via vinodkv)
 
+    MAPREDUCE-3718. Change default AM heartbeat interval to 1 second. (Hitesh
+    Shah via sseth)
+
   BUG FIXES
+    MAPREDUCE-3194. "mapred mradmin" command is broken in mrv2
+                     (Jason Lowe via bobby)
+
     MAPREDUCE-3462. Fix Gridmix JUnit testcase failures. 
                     (Ravi Prakash and Ravi Gummadi via amarrk)
 
@@ -498,6 +512,48 @@ Release 0.23.1 - Unreleased
 
     MAPREDUCE-3705. ant build fails on 0.23 branch. (Thomas Graves via
     mahadev)
+ 
+    MAPREDUCE-3691. webservices add support to compress response.
+    (Thomas Graves via mahadev)
+
+    MAPREDUCE-3702. internal server error trying to access application master 
+    via proxy with filter enabled (Thomas Graves via mahadev)
+
+    MAPREDUCE-3646. Remove redundant URL info from "mapred job" output.
+    (Jonathan Eagles via mahadev)
+
+    MAPREDUCE-3681. Fixed computation of queue's usedCapacity. (acmurthy) 
+
+    MAPREDUCE-3505. yarn APPLICATION_CLASSPATH needs to be overridable. 
+    (ahmed via tucu)
+
+    MAPREDUCE-3714. Fixed EventFetcher and Fetcher threads to shut-down properly
+    so that reducers don't hang in corner cases. (vinodkv)
+
+    MAPREDUCE-3712. The mapreduce tar does not contain the hadoop-mapreduce-client-
+    jobclient-tests.jar. (mahadev)
+
+    MAPREDUCE-3717. JobClient test jar has missing files to run all the test programs.
+    (mahadev)
+
+    MAPREDUCE-3630. Fixes a NullPointer exception while running TeraGen - if a
+    map is asked to generate 0 records. (Mahadev Konar via sseth)
+
+    MAPREDUCE-3683. Fixed maxCapacity of queues to be product of parent
+    maxCapacities. (acmurthy)
+
+    MAPREDUCE-3713. Fixed the way head-room is allocated to applications by
+    CapacityScheduler so that it deducts current-usage per user and not
+    per-application. (Arun C Murthy via vinodkv)
+
+    MAPREDUCE-3721. Fixed a race in shuffle which caused reduces to hang.
+    (sseth via acmurthy) 
+
+    MAPREDUCE-3733. Add Apache License Header to hadoop-distcp/pom.xml.
+    (mahadev)
+
+    MAPREDUCE-3735. Add distcp jar to the distribution (tar).
+    (mahadev)
 
 Release 0.23.0 - 2011-11-01 
 

+ 8 - 13
hadoop-mapreduce-project/bin/mapred

@@ -30,9 +30,6 @@ fi
 function print_usage(){
   echo "Usage: mapred [--config confdir] COMMAND"
   echo "       where COMMAND is one of:"
-  echo "  mradmin              run a Map-Reduce admin client"
-  echo "  jobtracker           run the MapReduce job Tracker node" 
-  echo "  tasktracker          run a MapReduce task Tracker node" 
   echo "  pipes                run a Pipes job"
   echo "  job                  manipulate MapReduce jobs"
   echo "  queue                get information regarding JobQueues"
@@ -51,16 +48,7 @@ fi
 COMMAND=$1
 shift
 
-if [ "$COMMAND" = "mradmin" ] ; then
-  CLASS=org.apache.hadoop.mapred.tools.MRAdmin
-  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
-elif [ "$COMMAND" = "jobtracker" ] ; then
-  CLASS=org.apache.hadoop.mapred.JobTracker
-  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
-elif [ "$COMMAND" = "tasktracker" ] ; then
-  CLASS=org.apache.hadoop.mapred.TaskTracker
-  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
-elif [ "$COMMAND" = "job" ] ; then
+if [ "$COMMAND" = "job" ] ; then
   CLASS=org.apache.hadoop.mapred.JobClient
 elif [ "$COMMAND" = "queue" ] ; then
   CLASS=org.apache.hadoop.mapred.JobQueueClient
@@ -75,6 +63,13 @@ elif [ "$COMMAND" = "classpath" ] ; then
 elif [ "$COMMAND" = "groups" ] ; then
   CLASS=org.apache.hadoop.mapred.tools.GetGroups
   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "mradmin" ] \
+    || [ "$COMMAND" = "jobtracker" ] \
+    || [ "$COMMAND" = "tasktracker" ] ; then
+  echo "Sorry, the $COMMAND command is no longer supported."
+  echo "You may find similar functionality with the \"yarn\" shell command."
+  print_usage
+  exit
 else
   echo $COMMAND - invalid command
   print_usage

+ 3 - 3
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java

@@ -522,13 +522,13 @@ public abstract class TaskAttemptImpl implements
    * a parent CLC and use it for all the containers, so this should go away
    * once the mr-generated-classpath stuff is gone.
    */
-  private static String getInitialClasspath() throws IOException {
+  private static String getInitialClasspath(Configuration conf) throws IOException {
     synchronized (classpathLock) {
       if (initialClasspathFlag.get()) {
         return initialClasspath;
       }
       Map<String, String> env = new HashMap<String, String>();
-      MRApps.setClasspath(env);
+      MRApps.setClasspath(env, conf);
       initialClasspath = env.get(Environment.CLASSPATH.name());
       initialClasspathFlag.set(true);
       return initialClasspath;
@@ -631,7 +631,7 @@ public abstract class TaskAttemptImpl implements
       Apps.addToEnvironment(
           environment,  
           Environment.CLASSPATH.name(), 
-          getInitialClasspath());
+          getInitialClasspath(conf));
     } catch (IOException e) {
       throw new YarnException(e);
     }

+ 4 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/pom.xml

@@ -38,6 +38,10 @@
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-mapreduce-client-core</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-yarn-server-common</artifactId>
+    </dependency>
   </dependencies>
 
   <build>

+ 9 - 6
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapreduce/v2/util/MRApps.java

@@ -54,6 +54,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.LocalResource;
 import org.apache.hadoop.yarn.api.records.LocalResourceType;
 import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
 import org.apache.hadoop.yarn.util.Apps;
 import org.apache.hadoop.yarn.util.BuilderUtils;
@@ -171,7 +172,7 @@ public class MRApps extends Apps {
   }
 
   private static void setMRFrameworkClasspath(
-      Map<String, String> environment) throws IOException {
+      Map<String, String> environment, Configuration conf) throws IOException {
     InputStream classpathFileStream = null;
     BufferedReader reader = null;
     try {
@@ -208,8 +209,10 @@ public class MRApps extends Apps {
       }
 
       // Add standard Hadoop classes
-      for (String c : ApplicationConstants.APPLICATION_CLASSPATH) {
-        Apps.addToEnvironment(environment, Environment.CLASSPATH.name(), c);
+      for (String c : conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH)
+          .split(",")) {
+        Apps.addToEnvironment(environment, Environment.CLASSPATH.name(), c
+            .trim());
       }
     } finally {
       if (classpathFileStream != null) {
@@ -222,8 +225,8 @@ public class MRApps extends Apps {
     // TODO: Remove duplicates.
   }
   
-  public static void setClasspath(Map<String, String> environment) 
-      throws IOException {
+  public static void setClasspath(Map<String, String> environment,
+      Configuration conf) throws IOException {
     Apps.addToEnvironment(
         environment, 
         Environment.CLASSPATH.name(), 
@@ -232,7 +235,7 @@ public class MRApps extends Apps {
         environment, 
         Environment.CLASSPATH.name(),
         Environment.PWD.$() + Path.SEPARATOR + "*");
-    MRApps.setMRFrameworkClasspath(environment);
+    MRApps.setMRFrameworkClasspath(environment, conf);
   }
   
   private static final String STAGING_CONSTANT = ".staging";
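
For MAPREDUCE-3505 in the changelog above, the container classpath now comes from the YarnConfiguration.YARN_APPLICATION_CLASSPATH setting instead of a hard-coded constant. A minimal stand-alone sketch of the same comma-split-and-trim idea, assuming a hypothetical ClasspathEnvSketch class and an example value in place of the real conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH) call:

import java.util.HashMap;
import java.util.Map;

// Hypothetical illustration; not the MRApps/Apps implementation.
public class ClasspathEnvSketch {
  static final String CLASSPATH = "CLASSPATH";

  // Append one entry to the CLASSPATH variable in the environment map,
  // mirroring the split-and-trim loop added to setMRFrameworkClasspath above.
  static void addToClasspath(Map<String, String> env, String entry) {
    String current = env.get(CLASSPATH);
    env.put(CLASSPATH, current == null ? entry : current + ":" + entry);
  }

  public static void main(String[] args) {
    // Assumed example value; in Hadoop this would come from
    // conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH).
    String configured = "$HADOOP_CONF_DIR, $HADOOP_COMMON_HOME/share/hadoop/common/*";
    Map<String, String> env = new HashMap<String, String>();
    for (String c : configured.split(",")) {
      addToClasspath(env, c.trim());
    }
    // Prints $HADOOP_CONF_DIR:$HADOOP_COMMON_HOME/share/hadoop/common/*
    System.out.println(env.get(CLASSPATH));
  }
}

Making the list configurable is what lets tests such as testSetClasspath below assert the full expanded value.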

+ 18 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/test/java/org/apache/hadoop/mapreduce/v2/util/TestMRApps.java

@@ -18,7 +18,12 @@
 
 package org.apache.hadoop.mapreduce.v2.util;
 
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.JobID;
 import org.apache.hadoop.mapreduce.MRJobConfig;
 import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@@ -121,4 +126,17 @@ public class TestMRApps {
         "/my/path/to/staging/dummy-user/.staging/job_dummy-job_12345/job.xml", jobFile);
   }
 
+  @Test public void testSetClasspath() throws IOException {
+    Job job = Job.getInstance();
+    Map<String, String> environment = new HashMap<String, String>();
+    MRApps.setClasspath(environment, job.getConfiguration());
+    assertEquals("job.jar:$PWD/*:$HADOOP_CONF_DIR:" +
+        "$HADOOP_COMMON_HOME/share/hadoop/common/*:" +
+        "$HADOOP_COMMON_HOME/share/hadoop/common/lib/*:" +
+        "$HADOOP_HDFS_HOME/share/hadoop/hdfs/*:" +
+        "$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*:" +
+        "$YARN_HOME/share/hadoop/mapreduce/*:" +
+        "$YARN_HOME/share/hadoop/mapreduce/lib/*",
+        environment.get("CLASSPATH"));
+  }
 }

+ 4 - 2
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java

@@ -289,8 +289,10 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
         }
         
         if (bytesRemaining != 0) {
-          splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining, 
-                     blkLocations[blkLocations.length-1].getHosts()));
+          String[] splitHosts = getSplitHosts(blkLocations, length
+              - bytesRemaining, bytesRemaining, clusterMap);
+          splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
+              splitHosts));
         }
       } else if (length != 0) {
         String[] splitHosts = getSplitHosts(blkLocations,0,length,clusterMap);

+ 1 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java

@@ -1216,6 +1216,7 @@ public class Job extends JobContextImpl implements JobContext {
       }
     });
     state = JobState.RUNNING;
+    LOG.info("The url to track the job: " + getTrackingURL());
    }
   
   /**

+ 1 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java

@@ -417,7 +417,7 @@ public interface MRJobConfig {
   /** How often the AM should send heartbeats to the RM.*/
   public static final String MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS =
     MR_AM_PREFIX + "scheduler.heartbeat.interval-ms";
-  public static final int DEFAULT_MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS = 2000;
+  public static final int DEFAULT_MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS = 1000;
 
   /**
    * If contact with RM is lost, the AM will wait MR_AM_TO_RM_WAIT_INTERVAL_MS
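
Since the default above drops from 2000 ms to 1000 ms (MAPREDUCE-3718), a job that needs the old pacing can still override it through the same key; a minimal sketch using only the MRJobConfig constants shown in this hunk:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MRJobConfig;

public class HeartbeatIntervalExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Restore the previous 2-second AM-to-RM heartbeat for this job only.
    conf.setInt(MRJobConfig.MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS, 2000);
    System.out.println("AM heartbeat interval (ms): "
        + conf.getInt(MRJobConfig.MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS,
            MRJobConfig.DEFAULT_MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS));
  }
}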

+ 2 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java

@@ -286,8 +286,9 @@ public abstract class FileInputFormat<K, V> extends InputFormat<K, V> {
           }
 
           if (bytesRemaining != 0) {
+            int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
             splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
-                       blkLocations[blkLocations.length-1].getHosts()));
+                       blkLocations[blkIndex].getHosts()));
           }
         } else { // not splitable
           splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
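
The two FileInputFormat hunks above (MAPREDUCE-3710) share one idea: the hosts for the final, partially filled split should come from the block that contains the split's start offset, not unconditionally from the file's last block. A simplified sketch of that index calculation, assuming equal-sized blocks laid back to back (the real code walks BlockLocation offsets via getBlockIndex/getSplitHosts):

// Hypothetical illustration, not the Hadoop implementation.
public class LastSplitLocalitySketch {

  // Index of the block containing the given offset, for fixed-size blocks.
  static int blockIndexForOffset(long blockSize, long fileLength, long offset) {
    if (offset < 0 || offset >= fileLength) {
      throw new IllegalArgumentException("offset outside file: " + offset);
    }
    return (int) (offset / blockSize);
  }

  public static void main(String[] args) {
    long mb = 1024L * 1024;
    long blockSize = 128 * mb;
    long fileLength = 1027 * mb;     // nine blocks, the last one only 3 MB
    long lastSplitStart = 896 * mb;  // seven 128 MB splits cut, 131 MB remain
    int numBlocks = (int) ((fileLength + blockSize - 1) / blockSize);

    // Old behaviour: always used the hosts of the final block (index 8 here).
    System.out.println("last block index    = " + (numBlocks - 1));
    // Fixed behaviour: use the block holding the split's start offset (index 7).
    System.out.println("block holding split = "
        + blockIndexForOffset(blockSize, fileLength, lastSplitStart));
  }
}

This is the same 1027 MB / 128 MB scenario exercised by testLastInputSplitExceedingSplitBoundary further down.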

+ 17 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/EventFetcher.java

@@ -27,6 +27,7 @@ import org.apache.hadoop.mapred.TaskCompletionEvent;
 import org.apache.hadoop.mapred.TaskUmbilicalProtocol;
 import org.apache.hadoop.mapreduce.TaskAttemptID;
 
+@SuppressWarnings("deprecation")
 class EventFetcher<K,V> extends Thread {
   private static final long SLEEP_TIME = 1000;
   private static final int MAX_EVENTS_TO_FETCH = 10000;
@@ -41,6 +42,8 @@ class EventFetcher<K,V> extends Thread {
   private ExceptionReporter exceptionReporter = null;
   
   private int maxMapRuntime = 0;
+
+  private volatile boolean stopped = false;
   
   public EventFetcher(TaskAttemptID reduce,
                       TaskUmbilicalProtocol umbilical,
@@ -60,7 +63,7 @@ class EventFetcher<K,V> extends Thread {
     LOG.info(reduce + " Thread started: " + getName());
     
     try {
-      while (true && !Thread.currentThread().isInterrupted()) {
+      while (!stopped && !Thread.currentThread().isInterrupted()) {
         try {
           int numNewMaps = getMapCompletionEvents();
           failures = 0;
@@ -71,6 +74,9 @@ class EventFetcher<K,V> extends Thread {
           if (!Thread.currentThread().isInterrupted()) {
             Thread.sleep(SLEEP_TIME);
           }
+        } catch (InterruptedException e) {
+          LOG.info("EventFetcher is interrupted.. Returning");
+          return;
         } catch (IOException ie) {
           LOG.info("Exception in getting events", ie);
           // check to see whether to abort
@@ -90,6 +96,16 @@ class EventFetcher<K,V> extends Thread {
       return;
     }
   }
+
+  public void shutDown() {
+    this.stopped = true;
+    interrupt();
+    try {
+      join(5000);
+    } catch(InterruptedException ie) {
+      LOG.warn("Got interrupted while joining " + getName(), ie);
+    }
+  }
   
   /** 
    * Queries the {@link TaskTracker} for a set of map-completion events 
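
The new shutDown() above (its twin is added to Fetcher, and Shuffle switches to calling both further down) follows a three-step pattern: set a volatile stop flag, interrupt to break out of blocking calls, then join with a bound so the caller can never hang. A minimal generic sketch of that pattern, using a hypothetical StoppableWorker class rather than the Hadoop fetcher threads:

// Hypothetical illustration of the stop-flag + interrupt + bounded-join pattern.
public class StoppableWorker extends Thread {
  private volatile boolean stopped = false;

  @Override
  public void run() {
    while (!stopped && !Thread.currentThread().isInterrupted()) {
      try {
        Thread.sleep(1000);   // stands in for fetching map-completion events
      } catch (InterruptedException ie) {
        return;               // interrupted during shutdown: just exit
      }
    }
  }

  public void shutDown() {
    stopped = true;           // 1. ask the loop to stop
    interrupt();              // 2. break out of any blocking call
    try {
      join(5000);             // 3. bounded wait, mirroring the 5s join above
    } catch (InterruptedException ie) {
      System.err.println("Interrupted while joining " + getName());
    }
  }

  public static void main(String[] args) throws InterruptedException {
    StoppableWorker w = new StoppableWorker();
    w.start();
    Thread.sleep(100);
    w.shutDown();
    System.out.println("worker alive after shutDown? " + w.isAlive());
  }
}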

+ 15 - 2
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/Fetcher.java

@@ -48,6 +48,7 @@ import org.apache.hadoop.mapreduce.task.reduce.MapOutput.Type;
 import org.apache.hadoop.util.Progressable;
 import org.apache.hadoop.util.ReflectionUtils;
 
+@SuppressWarnings({"deprecation"})
 class Fetcher<K,V> extends Thread {
   
   private static final Log LOG = LogFactory.getLog(Fetcher.class);
@@ -88,6 +89,8 @@ class Fetcher<K,V> extends Thread {
   private final Decompressor decompressor;
   private final SecretKey jobTokenSecret;
 
+  private volatile boolean stopped = false;
+
   public Fetcher(JobConf job, TaskAttemptID reduceId, 
                  ShuffleScheduler<K,V> scheduler, MergeManager<K,V> merger,
                  Reporter reporter, ShuffleClientMetrics metrics,
@@ -135,7 +138,7 @@ class Fetcher<K,V> extends Thread {
   
   public void run() {
     try {
-      while (true && !Thread.currentThread().isInterrupted()) {
+      while (!stopped && !Thread.currentThread().isInterrupted()) {
         MapHost host = null;
         try {
           // If merge is on, block
@@ -160,7 +163,17 @@ class Fetcher<K,V> extends Thread {
       exceptionReporter.reportException(t);
     }
   }
-  
+
+  public void shutDown() throws InterruptedException {
+    this.stopped = true;
+    interrupt();
+    try {
+      join(5000);
+    } catch (InterruptedException ie) {
+      LOG.warn("Got interrupt while joining " + getName(), ie);
+    }
+  }
+
   /**
    * The crux of the matter...
    * 

+ 26 - 12
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/MergeManager.java

@@ -92,6 +92,7 @@ public class MergeManager<K, V> {
   
   private final long memoryLimit;
   private long usedMemory;
+  private long commitMemory;
   private final long maxSingleShuffleLimit;
   
   private final int memToMemMergeOutputsThreshold; 
@@ -181,6 +182,13 @@ public class MergeManager<K, V> {
              "ioSortFactor=" + ioSortFactor + ", " +
              "memToMemMergeOutputsThreshold=" + memToMemMergeOutputsThreshold);
 
+    if (this.maxSingleShuffleLimit >= this.mergeThreshold) {
+      throw new RuntimeException("Invlaid configuration: "
+          + "maxSingleShuffleLimit should be less than mergeThreshold"
+          + "maxSingleShuffleLimit: " + this.maxSingleShuffleLimit
+          + "mergeThreshold: " + this.mergeThreshold);
+    }
+
     boolean allowMemToMemMerge = 
       jobConf.getBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false);
     if (allowMemToMemMerge) {
@@ -245,16 +253,16 @@ public class MergeManager<K, V> {
     // all the stalled threads
     
     if (usedMemory > memoryLimit) {
-      LOG.debug(mapId + ": Stalling shuffle since usedMemory (" + usedMemory + 
-               ") is greater than memoryLimit (" + memoryLimit + ")"); 
-      
+      LOG.debug(mapId + ": Stalling shuffle since usedMemory (" + usedMemory
+          + ") is greater than memoryLimit (" + memoryLimit + ")." + 
+          " CommitMemory is (" + commitMemory + ")"); 
       return stallShuffle;
     }
     
     // Allow the in-memory shuffle to progress
-    LOG.debug(mapId + ": Proceeding with shuffle since usedMemory (" +
-        usedMemory + 
-        ") is lesser than memoryLimit (" + memoryLimit + ")"); 
+    LOG.debug(mapId + ": Proceeding with shuffle since usedMemory ("
+        + usedMemory + ") is lesser than memoryLimit (" + memoryLimit + ")."
+        + "CommitMemory is (" + commitMemory + ")"); 
     return unconditionalReserve(mapId, requestedSize, true);
   }
   
@@ -270,18 +278,24 @@ public class MergeManager<K, V> {
   }
   
   synchronized void unreserve(long size) {
+    commitMemory -= size;
     usedMemory -= size;
   }
-  
+
   public synchronized void closeInMemoryFile(MapOutput<K,V> mapOutput) { 
     inMemoryMapOutputs.add(mapOutput);
     LOG.info("closeInMemoryFile -> map-output of size: " + mapOutput.getSize()
-        + ", inMemoryMapOutputs.size() -> " + inMemoryMapOutputs.size());
-    
+        + ", inMemoryMapOutputs.size() -> " + inMemoryMapOutputs.size()
+        + ", commitMemory -> " + commitMemory + ", usedMemory ->" + usedMemory);
+
+    commitMemory += mapOutput.getSize();
+
     synchronized (inMemoryMerger) {
-      if (!inMemoryMerger.isInProgress() && usedMemory >= mergeThreshold) {
-        LOG.info("Starting inMemoryMerger's merge since usedMemory=" +
-            usedMemory + " > mergeThreshold=" + mergeThreshold);
+      // Can hang if mergeThreshold is really low.
+      if (!inMemoryMerger.isInProgress() && commitMemory >= mergeThreshold) {
+        LOG.info("Starting inMemoryMerger's merge since commitMemory=" +
+            commitMemory + " > mergeThreshold=" + mergeThreshold + 
+            ". Current usedMemory=" + usedMemory);
         inMemoryMapOutputs.addAll(inMemoryMergedMapOutputs);
         inMemoryMergedMapOutputs.clear();
         inMemoryMerger.startMerge(inMemoryMapOutputs);
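
The MergeManager change above distinguishes memory that has merely been reserved for in-flight fetches (usedMemory) from memory whose map outputs have actually been committed via closeInMemoryFile (commitMemory), and triggers the in-memory merge on the latter. A toy accounting sketch of that distinction, using a hypothetical ShuffleMemorySketch class rather than the real MergeManager API:

// Hypothetical illustration of the reserve / commit / unreserve accounting.
public class ShuffleMemorySketch {
  private long usedMemory = 0;    // reserved by in-flight fetches
  private long commitMemory = 0;  // held by completed, closed map outputs
  private final long memoryLimit;
  private final long mergeThreshold;

  ShuffleMemorySketch(long memoryLimit, long mergeThreshold) {
    this.memoryLimit = memoryLimit;
    this.mergeThreshold = mergeThreshold;
  }

  // Reserve space for an incoming map output; false means "stall the shuffle".
  synchronized boolean reserve(long size) {
    if (usedMemory > memoryLimit) {
      return false;
    }
    usedMemory += size;
    return true;
  }

  // A fetch completed; true means the in-memory merge should start now.
  synchronized boolean commit(long size) {
    commitMemory += size;
    return commitMemory >= mergeThreshold;
  }

  // A merged or released output gives its memory back.
  synchronized void unreserve(long size) {
    commitMemory -= size;
    usedMemory -= size;
  }

  public static void main(String[] args) {
    ShuffleMemorySketch m = new ShuffleMemorySketch(100, 60);
    m.reserve(40);
    System.out.println(m.commit(40));  // false: only 40 committed, threshold 60
    m.reserve(30);
    System.out.println(m.commit(30));  // true: 70 committed >= 60, merge starts
  }
}

The reserve/commit/unreserve names here are hypothetical; in the diff the commit happens inside closeInMemoryFile and the release inside unreserve.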

+ 4 - 15
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/Shuffle.java

@@ -19,8 +19,6 @@ package org.apache.hadoop.mapreduce.task.reduce;
 
 import java.io.IOException;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.FileSystem;
@@ -33,17 +31,17 @@ import org.apache.hadoop.mapred.RawKeyValueIterator;
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.Task;
+import org.apache.hadoop.mapred.Task.CombineOutputCollector;
 import org.apache.hadoop.mapred.TaskStatus;
 import org.apache.hadoop.mapred.TaskUmbilicalProtocol;
-import org.apache.hadoop.mapred.Task.CombineOutputCollector;
 import org.apache.hadoop.mapreduce.MRJobConfig;
 import org.apache.hadoop.mapreduce.TaskAttemptID;
 import org.apache.hadoop.util.Progress;
 
 @InterfaceAudience.Private
 @InterfaceStability.Unstable
+@SuppressWarnings({"deprecation", "unchecked", "rawtypes"})
 public class Shuffle<K, V> implements ExceptionReporter {
-  private static final Log LOG = LogFactory.getLog(Shuffle.class);
   private static final int PROGRESS_FREQUENCY = 2000;
   
   private final TaskAttemptID reduceId;
@@ -100,7 +98,6 @@ public class Shuffle<K, V> implements ExceptionReporter {
                                     this, mergePhase, mapOutputFile);
   }
 
-  @SuppressWarnings("unchecked")
   public RawKeyValueIterator run() throws IOException, InterruptedException {
     // Start the map-completion events fetcher thread
     final EventFetcher<K,V> eventFetcher = 
@@ -130,19 +127,11 @@ public class Shuffle<K, V> implements ExceptionReporter {
     }
 
     // Stop the event-fetcher thread
-    eventFetcher.interrupt();
-    try {
-      eventFetcher.join();
-    } catch(Throwable t) {
-      LOG.info("Failed to stop " + eventFetcher.getName(), t);
-    }
+    eventFetcher.shutDown();
     
     // Stop the map-output fetcher threads
     for (Fetcher<K,V> fetcher : fetchers) {
-      fetcher.interrupt();
-    }
-    for (Fetcher<K,V> fetcher : fetchers) {
-      fetcher.join();
+      fetcher.shutDown();
     }
     fetchers = null;
     

+ 7 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/pom.xml

@@ -102,6 +102,13 @@
             <phase>test-compile</phase>
           </execution>
         </executions>
+        <configuration>       
+         <archive>
+          <manifest>
+           <mainClass>org.apache.hadoop.test.MapredTestDriver</mainClass>
+         </manifest>
+         </archive>
+        </configuration>
       </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>

+ 0 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/main/java/org/apache/hadoop/mapred/ClientServiceDelegate.java

@@ -175,7 +175,6 @@ public class ClientServiceDelegate {
                 + ":" + addr.getPort()));
             newUgi.addToken(clientToken);
           }
-          LOG.info("The url to track the job: " + application.getTrackingUrl());
           LOG.debug("Connecting to " + serviceAddr);
           final String tempStr = serviceAddr;
           realProxy = newUgi.doAs(new PrivilegedExceptionAction<MRClientProtocol>() {

+ 1 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/main/java/org/apache/hadoop/mapred/YARNRunner.java

@@ -406,7 +406,7 @@ public class YARNRunner implements ClientProtocol {
     // Setup the CLASSPATH in environment
     // i.e. add { job jar, CWD, Hadoop jars} to classpath.
     Map<String, String> environment = new HashMap<String, String>();
-    MRApps.setClasspath(environment);
+    MRApps.setClasspath(environment, conf);
 
     // Parse distributed cache
     MRApps.setupDistributedCache(jobConf, localResources);

+ 1 - 1
hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/mapred/GenericMRLoadGenerator.java → hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/GenericMRLoadGenerator.java

@@ -29,7 +29,6 @@ import java.util.Stack;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.examples.RandomTextWriter;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -40,6 +39,7 @@ import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.io.WritableUtils;
 import org.apache.hadoop.mapred.lib.NullOutputFormat;
+import org.apache.hadoop.mapreduce.RandomTextWriter;
 import org.apache.hadoop.util.GenericOptionsParser;
 import org.apache.hadoop.util.ReflectionUtils;
 import org.apache.hadoop.util.Tool;

+ 101 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java

@@ -17,6 +17,10 @@
  */
 package org.apache.hadoop.mapred;
 
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
 import java.io.DataOutputStream;
 import java.io.IOException;
 
@@ -32,6 +36,7 @@ import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.io.Text;
 
+@SuppressWarnings("deprecation")
 public class TestFileInputFormat extends TestCase {
 
   Configuration conf = new Configuration();
@@ -186,6 +191,102 @@ public class TestFileInputFormat extends TestCase {
     assertEquals(splits.length, 2);
   }
 
+  @SuppressWarnings("rawtypes")
+  public void testLastInputSplitAtSplitBoundary() throws Exception {
+    FileInputFormat fif = new FileInputFormatForTest(1024l * 1024 * 1024,
+        128l * 1024 * 1024);
+    JobConf job = new JobConf();
+    InputSplit[] splits = fif.getSplits(job, 8);
+    assertEquals(8, splits.length);
+    for (int i = 0; i < splits.length; i++) {
+      InputSplit split = splits[i];
+      assertEquals(("host" + i), split.getLocations()[0]);
+    }
+  }
+
+  @SuppressWarnings("rawtypes")
+  public void testLastInputSplitExceedingSplitBoundary() throws Exception {
+    FileInputFormat fif = new FileInputFormatForTest(1027l * 1024 * 1024,
+        128l * 1024 * 1024);
+    JobConf job = new JobConf();
+    InputSplit[] splits = fif.getSplits(job, 8);
+    assertEquals(8, splits.length);
+    for (int i = 0; i < splits.length; i++) {
+      InputSplit split = splits[i];
+      assertEquals(("host" + i), split.getLocations()[0]);
+    }
+  }
+
+  @SuppressWarnings("rawtypes")
+  public void testLastInputSplitSingleSplit() throws Exception {
+    FileInputFormat fif = new FileInputFormatForTest(100l * 1024 * 1024,
+        128l * 1024 * 1024);
+    JobConf job = new JobConf();
+    InputSplit[] splits = fif.getSplits(job, 1);
+    assertEquals(1, splits.length);
+    for (int i = 0; i < splits.length; i++) {
+      InputSplit split = splits[i];
+      assertEquals(("host" + i), split.getLocations()[0]);
+    }
+  }
+
+  private class FileInputFormatForTest<K, V> extends FileInputFormat<K, V> {
+
+    long splitSize;
+    long length;
+
+    FileInputFormatForTest(long length, long splitSize) {
+      this.length = length;
+      this.splitSize = splitSize;
+    }
+
+    @Override
+    public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job,
+        Reporter reporter) throws IOException {
+      return null;
+    }
+
+    @Override
+    protected FileStatus[] listStatus(JobConf job) throws IOException {
+      FileStatus mockFileStatus = mock(FileStatus.class);
+      when(mockFileStatus.getBlockSize()).thenReturn(splitSize);
+      when(mockFileStatus.isDirectory()).thenReturn(false);
+      Path mockPath = mock(Path.class);
+      FileSystem mockFs = mock(FileSystem.class);
+
+      BlockLocation[] blockLocations = mockBlockLocations(length, splitSize);
+      when(mockFs.getFileBlockLocations(mockFileStatus, 0, length)).thenReturn(
+          blockLocations);
+      when(mockPath.getFileSystem(any(Configuration.class))).thenReturn(mockFs);
+
+      when(mockFileStatus.getPath()).thenReturn(mockPath);
+      when(mockFileStatus.getLen()).thenReturn(length);
+
+      FileStatus[] fs = new FileStatus[1];
+      fs[0] = mockFileStatus;
+      return fs;
+    }
+
+    @Override
+    protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
+      return splitSize;
+    }
+
+    private BlockLocation[] mockBlockLocations(long size, long splitSize) {
+      int numLocations = (int) (size / splitSize);
+      if (size % splitSize != 0)
+        numLocations++;
+      BlockLocation[] blockLocations = new BlockLocation[numLocations];
+      for (int i = 0; i < numLocations; i++) {
+        String[] names = new String[] { "b" + i };
+        String[] hosts = new String[] { "host" + i };
+        blockLocations[i] = new BlockLocation(names, hosts, i * splitSize,
+            Math.min(splitSize, size - (splitSize * i)));
+      }
+      return blockLocations;
+    }
+  }
+
   static void writeFile(Configuration conf, Path name,
       short replication, int numBlocks) throws IOException {
     FileSystem fileSys = FileSystem.get(conf);

+ 0 - 1
hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/mapred/ThreadedMapBenchmark.java → hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/ThreadedMapBenchmark.java

@@ -25,7 +25,6 @@ import java.util.Random;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.examples.RandomWriter;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.BytesWritable;

+ 0 - 1
hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/mapreduce/GenericMRLoadGenerator.java → hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/GenericMRLoadGenerator.java

@@ -29,7 +29,6 @@ import java.util.Stack;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.examples.RandomTextWriter;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;

+ 757 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/RandomTextWriter.java

@@ -0,0 +1,757 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.ClusterStatus;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * This program uses map/reduce to just run a distributed job where there is
+ * no interaction between the tasks and each task writes a large unsorted
+ * random sequence of words.
+ * In order for this program to generate data for terasort with 5-10 words
+ * per key and 20-100 words per value, have the following config:
+ * <xmp>
+ * <?xml version="1.0"?>
+ * <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+ * <configuration>
+ *   <property>
+ *     <name>mapreduce.randomtextwriter.minwordskey</name>
+ *     <value>5</value>
+ *   </property>
+ *   <property>
+ *     <name>mapreduce.randomtextwriter.maxwordskey</name>
+ *     <value>10</value>
+ *   </property>
+ *   <property>
+ *     <name>mapreduce.randomtextwriter.minwordsvalue</name>
+ *     <value>20</value>
+ *   </property>
+ *   <property>
+ *     <name>mapreduce.randomtextwriter.maxwordsvalue</name>
+ *     <value>100</value>
+ *   </property>
+ *   <property>
+ *     <name>mapreduce.randomtextwriter.totalbytes</name>
+ *     <value>1099511627776</value>
+ *   </property>
+ * </configuration></xmp>
+ * 
+ * Equivalently, {@link RandomTextWriter} also supports all the above options
+ * and ones supported by {@link Tool} via the command-line.
+ * 
+ * To run: bin/hadoop jar hadoop-${version}-examples.jar randomtextwriter
+ *            [-outFormat <i>output format class</i>] <i>output</i> 
+ */
+public class RandomTextWriter extends Configured implements Tool {
+  public static final String TOTAL_BYTES = 
+    "mapreduce.randomtextwriter.totalbytes";
+  public static final String BYTES_PER_MAP = 
+    "mapreduce.randomtextwriter.bytespermap";
+  public static final String MAPS_PER_HOST = 
+    "mapreduce.randomtextwriter.mapsperhost";
+  public static final String MAX_VALUE = "mapreduce.randomtextwriter.maxwordsvalue";
+  public static final String MIN_VALUE = "mapreduce.randomtextwriter.minwordsvalue";
+  public static final String MIN_KEY = "mapreduce.randomtextwriter.minwordskey";
+  public static final String MAX_KEY = "mapreduce.randomtextwriter.maxwordskey";
+  
+  static int printUsage() {
+    System.out.println("randomtextwriter " +
+                       "[-outFormat <output format class>] " + 
+                       "<output>");
+    ToolRunner.printGenericCommandUsage(System.out);
+    return 2;
+  }
+  
+  /**
+   * User counters
+   */
+  static enum Counters { RECORDS_WRITTEN, BYTES_WRITTEN }
+
+  static class RandomTextMapper extends Mapper<Text, Text, Text, Text> {
+    
+    private long numBytesToWrite;
+    private int minWordsInKey;
+    private int wordsInKeyRange;
+    private int minWordsInValue;
+    private int wordsInValueRange;
+    private Random random = new Random();
+    
+    /**
+     * Save the configuration value that we need to write the data.
+     */
+    public void setup(Context context) {
+      Configuration conf = context.getConfiguration();
+      numBytesToWrite = conf.getLong(BYTES_PER_MAP,
+                                    1*1024*1024*1024);
+      minWordsInKey = conf.getInt(MIN_KEY, 5);
+      wordsInKeyRange = (conf.getInt(MAX_KEY, 10) - minWordsInKey);
+      minWordsInValue = conf.getInt(MIN_VALUE, 10);
+      wordsInValueRange = (conf.getInt(MAX_VALUE, 100) - minWordsInValue);
+    }
+    
+    /**
+     * Given an output filename, write a bunch of random records to it.
+     */
+    public void map(Text key, Text value,
+                    Context context) throws IOException,InterruptedException {
+      int itemCount = 0;
+      while (numBytesToWrite > 0) {
+        // Generate the key/value 
+        int noWordsKey = minWordsInKey + 
+          (wordsInKeyRange != 0 ? random.nextInt(wordsInKeyRange) : 0);
+        int noWordsValue = minWordsInValue + 
+          (wordsInValueRange != 0 ? random.nextInt(wordsInValueRange) : 0);
+        Text keyWords = generateSentence(noWordsKey);
+        Text valueWords = generateSentence(noWordsValue);
+        
+        // Write the sentence 
+        context.write(keyWords, valueWords);
+        
+        numBytesToWrite -= (keyWords.getLength() + valueWords.getLength());
+        
+        // Update counters, progress etc.
+        context.getCounter(Counters.BYTES_WRITTEN).increment(
+                  keyWords.getLength() + valueWords.getLength());
+        context.getCounter(Counters.RECORDS_WRITTEN).increment(1);
+        if (++itemCount % 200 == 0) {
+          context.setStatus("wrote record " + itemCount + ". " + 
+                             numBytesToWrite + " bytes left.");
+        }
+      }
+      context.setStatus("done with " + itemCount + " records.");
+    }
+    
+    private Text generateSentence(int noWords) {
+      StringBuffer sentence = new StringBuffer();
+      String space = " ";
+      for (int i=0; i < noWords; ++i) {
+        sentence.append(words[random.nextInt(words.length)]);
+        sentence.append(space);
+      }
+      return new Text(sentence.toString());
+    }
+  }
+  
+  /**
+   * This is the main routine for launching a distributed random write job.
+   * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
+   * The reduce doesn't do anything.
+   * 
+   * @throws IOException 
+   */
+  public int run(String[] args) throws Exception {    
+    if (args.length == 0) {
+      return printUsage();    
+    }
+    
+    Configuration conf = getConf();
+    JobClient client = new JobClient(conf);
+    ClusterStatus cluster = client.getClusterStatus();
+    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
+    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
+                                             1*1024*1024*1024);
+    if (numBytesToWritePerMap == 0) {
+      System.err.println("Cannot have " + BYTES_PER_MAP +" set to 0");
+      return -2;
+    }
+    long totalBytesToWrite = conf.getLong(TOTAL_BYTES, 
+         numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
+    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
+    if (numMaps == 0 && totalBytesToWrite > 0) {
+      numMaps = 1;
+      conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
+    }
+    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
+    
+    Job job = new Job(conf);
+    
+    job.setJarByClass(RandomTextWriter.class);
+    job.setJobName("random-text-writer");
+    
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(Text.class);
+    
+    job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
+    job.setMapperClass(RandomTextMapper.class);        
+    
+    Class<? extends OutputFormat> outputFormatClass = 
+      SequenceFileOutputFormat.class;
+    List<String> otherArgs = new ArrayList<String>();
+    for(int i=0; i < args.length; ++i) {
+      try {
+        if ("-outFormat".equals(args[i])) {
+          outputFormatClass = 
+            Class.forName(args[++i]).asSubclass(OutputFormat.class);
+        } else {
+          otherArgs.add(args[i]);
+        }
+      } catch (ArrayIndexOutOfBoundsException except) {
+        System.out.println("ERROR: Required parameter missing from " +
+            args[i-1]);
+        return printUsage(); // exits
+      }
+    }
+
+    job.setOutputFormatClass(outputFormatClass);
+    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
+    
+    System.out.println("Running " + numMaps + " maps.");
+    
+    // reducer NONE
+    job.setNumReduceTasks(0);
+    
+    Date startTime = new Date();
+    System.out.println("Job started: " + startTime);
+    int ret = job.waitForCompletion(true) ? 0 : 1;
+    Date endTime = new Date();
+    System.out.println("Job ended: " + endTime);
+    System.out.println("The job took " + 
+                       (endTime.getTime() - startTime.getTime()) /1000 + 
+                       " seconds.");
+    
+    return ret;
+  }
+  
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(new Configuration(), new RandomTextWriter(), args);
+    System.exit(res);
+  }
+
+  /**
+   * A random list of 100 words from /usr/share/dict/words
+   */
+  private static String[] words = {
+                                   "diurnalness", "Homoiousian",
+                                   "spiranthic", "tetragynian",
+                                   "silverhead", "ungreat",
+                                   "lithograph", "exploiter",
+                                   "physiologian", "by",
+                                   "hellbender", "Filipendula",
+                                   "undeterring", "antiscolic",
+                                   "pentagamist", "hypoid",
+                                   "cacuminal", "sertularian",
+                                   "schoolmasterism", "nonuple",
+                                   "gallybeggar", "phytonic",
+                                   "swearingly", "nebular",
+                                   "Confervales", "thermochemically",
+                                   "characinoid", "cocksuredom",
+                                   "fallacious", "feasibleness",
+                                   "debromination", "playfellowship",
+                                   "tramplike", "testa",
+                                   "participatingly", "unaccessible",
+                                   "bromate", "experientialist",
+                                   "roughcast", "docimastical",
+                                   "choralcelo", "blightbird",
+                                   "peptonate", "sombreroed",
+                                   "unschematized", "antiabolitionist",
+                                   "besagne", "mastication",
+                                   "bromic", "sviatonosite",
+                                   "cattimandoo", "metaphrastical",
+                                   "endotheliomyoma", "hysterolysis",
+                                   "unfulminated", "Hester",
+                                   "oblongly", "blurredness",
+                                   "authorling", "chasmy",
+                                   "Scorpaenidae", "toxihaemia",
+                                   "Dictograph", "Quakerishly",
+                                   "deaf", "timbermonger",
+                                   "strammel", "Thraupidae",
+                                   "seditious", "plerome",
+                                   "Arneb", "eristically",
+                                   "serpentinic", "glaumrie",
+                                   "socioromantic", "apocalypst",
+                                   "tartrous", "Bassaris",
+                                   "angiolymphoma", "horsefly",
+                                   "kenno", "astronomize",
+                                   "euphemious", "arsenide",
+                                   "untongued", "parabolicness",
+                                   "uvanite", "helpless",
+                                   "gemmeous", "stormy",
+                                   "templar", "erythrodextrin",
+                                   "comism", "interfraternal",
+                                   "preparative", "parastas",
+                                   "frontoorbital", "Ophiosaurus",
+                                   "diopside", "serosanguineous",
+                                   "ununiformly", "karyological",
+                                   "collegian", "allotropic",
+                                   "depravity", "amylogenesis",
+                                   "reformatory", "epidymides",
+                                   "pleurotropous", "trillium",
+                                   "dastardliness", "coadvice",
+                                   "embryotic", "benthonic",
+                                   "pomiferous", "figureheadship",
+                                   "Megaluridae", "Harpa",
+                                   "frenal", "commotion",
+                                   "abthainry", "cobeliever",
+                                   "manilla", "spiciferous",
+                                   "nativeness", "obispo",
+                                   "monilioid", "biopsic",
+                                   "valvula", "enterostomy",
+                                   "planosubulate", "pterostigma",
+                                   "lifter", "triradiated",
+                                   "venialness", "tum",
+                                   "archistome", "tautness",
+                                   "unswanlike", "antivenin",
+                                   "Lentibulariaceae", "Triphora",
+                                   "angiopathy", "anta",
+                                   "Dawsonia", "becomma",
+                                   "Yannigan", "winterproof",
+                                   "antalgol", "harr",
+                                   "underogating", "ineunt",
+                                   "cornberry", "flippantness",
+                                   "scyphostoma", "approbation",
+                                   "Ghent", "Macraucheniidae",
+                                   "scabbiness", "unanatomized",
+                                   "photoelasticity", "eurythermal",
+                                   "enation", "prepavement",
+                                   "flushgate", "subsequentially",
+                                   "Edo", "antihero",
+                                   "Isokontae", "unforkedness",
+                                   "porriginous", "daytime",
+                                   "nonexecutive", "trisilicic",
+                                   "morphiomania", "paranephros",
+                                   "botchedly", "impugnation",
+                                   "Dodecatheon", "obolus",
+                                   "unburnt", "provedore",
+                                   "Aktistetae", "superindifference",
+                                   "Alethea", "Joachimite",
+                                   "cyanophilous", "chorograph",
+                                   "brooky", "figured",
+                                   "periclitation", "quintette",
+                                   "hondo", "ornithodelphous",
+                                   "unefficient", "pondside",
+                                   "bogydom", "laurinoxylon",
+                                   "Shiah", "unharmed",
+                                   "cartful", "noncrystallized",
+                                   "abusiveness", "cromlech",
+                                   "japanned", "rizzomed",
+                                   "underskin", "adscendent",
+                                   "allectory", "gelatinousness",
+                                   "volcano", "uncompromisingly",
+                                   "cubit", "idiotize",
+                                   "unfurbelowed", "undinted",
+                                   "magnetooptics", "Savitar",
+                                   "diwata", "ramosopalmate",
+                                   "Pishquow", "tomorn",
+                                   "apopenptic", "Haversian",
+                                   "Hysterocarpus", "ten",
+                                   "outhue", "Bertat",
+                                   "mechanist", "asparaginic",
+                                   "velaric", "tonsure",
+                                   "bubble", "Pyrales",
+                                   "regardful", "glyphography",
+                                   "calabazilla", "shellworker",
+                                   "stradametrical", "havoc",
+                                   "theologicopolitical", "sawdust",
+                                   "diatomaceous", "jajman",
+                                   "temporomastoid", "Serrifera",
+                                   "Ochnaceae", "aspersor",
+                                   "trailmaking", "Bishareen",
+                                   "digitule", "octogynous",
+                                   "epididymitis", "smokefarthings",
+                                   "bacillite", "overcrown",
+                                   "mangonism", "sirrah",
+                                   "undecorated", "psychofugal",
+                                   "bismuthiferous", "rechar",
+                                   "Lemuridae", "frameable",
+                                   "thiodiazole", "Scanic",
+                                   "sportswomanship", "interruptedness",
+                                   "admissory", "osteopaedion",
+                                   "tingly", "tomorrowness",
+                                   "ethnocracy", "trabecular",
+                                   "vitally", "fossilism",
+                                   "adz", "metopon",
+                                   "prefatorial", "expiscate",
+                                   "diathermacy", "chronist",
+                                   "nigh", "generalizable",
+                                   "hysterogen", "aurothiosulphuric",
+                                   "whitlowwort", "downthrust",
+                                   "Protestantize", "monander",
+                                   "Itea", "chronographic",
+                                   "silicize", "Dunlop",
+                                   "eer", "componental",
+                                   "spot", "pamphlet",
+                                   "antineuritic", "paradisean",
+                                   "interruptor", "debellator",
+                                   "overcultured", "Florissant",
+                                   "hyocholic", "pneumatotherapy",
+                                   "tailoress", "rave",
+                                   "unpeople", "Sebastian",
+                                   "thermanesthesia", "Coniferae",
+                                   "swacking", "posterishness",
+                                   "ethmopalatal", "whittle",
+                                   "analgize", "scabbardless",
+                                   "naught", "symbiogenetically",
+                                   "trip", "parodist",
+                                   "columniform", "trunnel",
+                                   "yawler", "goodwill",
+                                   "pseudohalogen", "swangy",
+                                   "cervisial", "mediateness",
+                                   "genii", "imprescribable",
+                                   "pony", "consumptional",
+                                   "carposporangial", "poleax",
+                                   "bestill", "subfebrile",
+                                   "sapphiric", "arrowworm",
+                                   "qualminess", "ultraobscure",
+                                   "thorite", "Fouquieria",
+                                   "Bermudian", "prescriber",
+                                   "elemicin", "warlike",
+                                   "semiangle", "rotular",
+                                   "misthread", "returnability",
+                                   "seraphism", "precostal",
+                                   "quarried", "Babylonism",
+                                   "sangaree", "seelful",
+                                   "placatory", "pachydermous",
+                                   "bozal", "galbulus",
+                                   "spermaphyte", "cumbrousness",
+                                   "pope", "signifier",
+                                   "Endomycetaceae", "shallowish",
+                                   "sequacity", "periarthritis",
+                                   "bathysphere", "pentosuria",
+                                   "Dadaism", "spookdom",
+                                   "Consolamentum", "afterpressure",
+                                   "mutter", "louse",
+                                   "ovoviviparous", "corbel",
+                                   "metastoma", "biventer",
+                                   "Hydrangea", "hogmace",
+                                   "seizing", "nonsuppressed",
+                                   "oratorize", "uncarefully",
+                                   "benzothiofuran", "penult",
+                                   "balanocele", "macropterous",
+                                   "dishpan", "marten",
+                                   "absvolt", "jirble",
+                                   "parmelioid", "airfreighter",
+                                   "acocotl", "archesporial",
+                                   "hypoplastral", "preoral",
+                                   "quailberry", "cinque",
+                                   "terrestrially", "stroking",
+                                   "limpet", "moodishness",
+                                   "canicule", "archididascalian",
+                                   "pompiloid", "overstaid",
+                                   "introducer", "Italical",
+                                   "Christianopaganism", "prescriptible",
+                                   "subofficer", "danseuse",
+                                   "cloy", "saguran",
+                                   "frictionlessly", "deindividualization",
+                                   "Bulanda", "ventricous",
+                                   "subfoliar", "basto",
+                                   "scapuloradial", "suspend",
+                                   "stiffish", "Sphenodontidae",
+                                   "eternal", "verbid",
+                                   "mammonish", "upcushion",
+                                   "barkometer", "concretion",
+                                   "preagitate", "incomprehensible",
+                                   "tristich", "visceral",
+                                   "hemimelus", "patroller",
+                                   "stentorophonic", "pinulus",
+                                   "kerykeion", "brutism",
+                                   "monstership", "merciful",
+                                   "overinstruct", "defensibly",
+                                   "bettermost", "splenauxe",
+                                   "Mormyrus", "unreprimanded",
+                                   "taver", "ell",
+                                   "proacquittal", "infestation",
+                                   "overwoven", "Lincolnlike",
+                                   "chacona", "Tamil",
+                                   "classificational", "lebensraum",
+                                   "reeveland", "intuition",
+                                   "Whilkut", "focaloid",
+                                   "Eleusinian", "micromembrane",
+                                   "byroad", "nonrepetition",
+                                   "bacterioblast", "brag",
+                                   "ribaldrous", "phytoma",
+                                   "counteralliance", "pelvimetry",
+                                   "pelf", "relaster",
+                                   "thermoresistant", "aneurism",
+                                   "molossic", "euphonym",
+                                   "upswell", "ladhood",
+                                   "phallaceous", "inertly",
+                                   "gunshop", "stereotypography",
+                                   "laryngic", "refasten",
+                                   "twinling", "oflete",
+                                   "hepatorrhaphy", "electrotechnics",
+                                   "cockal", "guitarist",
+                                   "topsail", "Cimmerianism",
+                                   "larklike", "Llandovery",
+                                   "pyrocatechol", "immatchable",
+                                   "chooser", "metrocratic",
+                                   "craglike", "quadrennial",
+                                   "nonpoisonous", "undercolored",
+                                   "knob", "ultratense",
+                                   "balladmonger", "slait",
+                                   "sialadenitis", "bucketer",
+                                   "magnificently", "unstipulated",
+                                   "unscourged", "unsupercilious",
+                                   "packsack", "pansophism",
+                                   "soorkee", "percent",
+                                   "subirrigate", "champer",
+                                   "metapolitics", "spherulitic",
+                                   "involatile", "metaphonical",
+                                   "stachyuraceous", "speckedness",
+                                   "bespin", "proboscidiform",
+                                   "gul", "squit",
+                                   "yeelaman", "peristeropode",
+                                   "opacousness", "shibuichi",
+                                   "retinize", "yote",
+                                   "misexposition", "devilwise",
+                                   "pumpkinification", "vinny",
+                                   "bonze", "glossing",
+                                   "decardinalize", "transcortical",
+                                   "serphoid", "deepmost",
+                                   "guanajuatite", "wemless",
+                                   "arval", "lammy",
+                                   "Effie", "Saponaria",
+                                   "tetrahedral", "prolificy",
+                                   "excerpt", "dunkadoo",
+                                   "Spencerism", "insatiately",
+                                   "Gilaki", "oratorship",
+                                   "arduousness", "unbashfulness",
+                                   "Pithecolobium", "unisexuality",
+                                   "veterinarian", "detractive",
+                                   "liquidity", "acidophile",
+                                   "proauction", "sural",
+                                   "totaquina", "Vichyite",
+                                   "uninhabitedness", "allegedly",
+                                   "Gothish", "manny",
+                                   "Inger", "flutist",
+                                   "ticktick", "Ludgatian",
+                                   "homotransplant", "orthopedical",
+                                   "diminutively", "monogoneutic",
+                                   "Kenipsim", "sarcologist",
+                                   "drome", "stronghearted",
+                                   "Fameuse", "Swaziland",
+                                   "alen", "chilblain",
+                                   "beatable", "agglomeratic",
+                                   "constitutor", "tendomucoid",
+                                   "porencephalous", "arteriasis",
+                                   "boser", "tantivy",
+                                   "rede", "lineamental",
+                                   "uncontradictableness", "homeotypical",
+                                   "masa", "folious",
+                                   "dosseret", "neurodegenerative",
+                                   "subtransverse", "Chiasmodontidae",
+                                   "palaeotheriodont", "unstressedly",
+                                   "chalcites", "piquantness",
+                                   "lampyrine", "Aplacentalia",
+                                   "projecting", "elastivity",
+                                   "isopelletierin", "bladderwort",
+                                   "strander", "almud",
+                                   "iniquitously", "theologal",
+                                   "bugre", "chargeably",
+                                   "imperceptivity", "meriquinoidal",
+                                   "mesophyte", "divinator",
+                                   "perfunctory", "counterappellant",
+                                   "synovial", "charioteer",
+                                   "crystallographical", "comprovincial",
+                                   "infrastapedial", "pleasurehood",
+                                   "inventurous", "ultrasystematic",
+                                   "subangulated", "supraoesophageal",
+                                   "Vaishnavism", "transude",
+                                   "chrysochrous", "ungrave",
+                                   "reconciliable", "uninterpleaded",
+                                   "erlking", "wherefrom",
+                                   "aprosopia", "antiadiaphorist",
+                                   "metoxazine", "incalculable",
+                                   "umbellic", "predebit",
+                                   "foursquare", "unimmortal",
+                                   "nonmanufacture", "slangy",
+                                   "predisputant", "familist",
+                                   "preaffiliate", "friarhood",
+                                   "corelysis", "zoonitic",
+                                   "halloo", "paunchy",
+                                   "neuromimesis", "aconitine",
+                                   "hackneyed", "unfeeble",
+                                   "cubby", "autoschediastical",
+                                   "naprapath", "lyrebird",
+                                   "inexistency", "leucophoenicite",
+                                   "ferrogoslarite", "reperuse",
+                                   "uncombable", "tambo",
+                                   "propodiale", "diplomatize",
+                                   "Russifier", "clanned",
+                                   "corona", "michigan",
+                                   "nonutilitarian", "transcorporeal",
+                                   "bought", "Cercosporella",
+                                   "stapedius", "glandularly",
+                                   "pictorially", "weism",
+                                   "disilane", "rainproof",
+                                   "Caphtor", "scrubbed",
+                                   "oinomancy", "pseudoxanthine",
+                                   "nonlustrous", "redesertion",
+                                   "Oryzorictinae", "gala",
+                                   "Mycogone", "reappreciate",
+                                   "cyanoguanidine", "seeingness",
+                                   "breadwinner", "noreast",
+                                   "furacious", "epauliere",
+                                   "omniscribent", "Passiflorales",
+                                   "uninductive", "inductivity",
+                                   "Orbitolina", "Semecarpus",
+                                   "migrainoid", "steprelationship",
+                                   "phlogisticate", "mesymnion",
+                                   "sloped", "edificator",
+                                   "beneficent", "culm",
+                                   "paleornithology", "unurban",
+                                   "throbless", "amplexifoliate",
+                                   "sesquiquintile", "sapience",
+                                   "astucious", "dithery",
+                                   "boor", "ambitus",
+                                   "scotching", "uloid",
+                                   "uncompromisingness", "hoove",
+                                   "waird", "marshiness",
+                                   "Jerusalem", "mericarp",
+                                   "unevoked", "benzoperoxide",
+                                   "outguess", "pyxie",
+                                   "hymnic", "euphemize",
+                                   "mendacity", "erythremia",
+                                   "rosaniline", "unchatteled",
+                                   "lienteria", "Bushongo",
+                                   "dialoguer", "unrepealably",
+                                   "rivethead", "antideflation",
+                                   "vinegarish", "manganosiderite",
+                                   "doubtingness", "ovopyriform",
+                                   "Cephalodiscus", "Muscicapa",
+                                   "Animalivora", "angina",
+                                   "planispheric", "ipomoein",
+                                   "cuproiodargyrite", "sandbox",
+                                   "scrat", "Munnopsidae",
+                                   "shola", "pentafid",
+                                   "overstudiousness", "times",
+                                   "nonprofession", "appetible",
+                                   "valvulotomy", "goladar",
+                                   "uniarticular", "oxyterpene",
+                                   "unlapsing", "omega",
+                                   "trophonema", "seminonflammable",
+                                   "circumzenithal", "starer",
+                                   "depthwise", "liberatress",
+                                   "unleavened", "unrevolting",
+                                   "groundneedle", "topline",
+                                   "wandoo", "umangite",
+                                   "ordinant", "unachievable",
+                                   "oversand", "snare",
+                                   "avengeful", "unexplicit",
+                                   "mustafina", "sonable",
+                                   "rehabilitative", "eulogization",
+                                   "papery", "technopsychology",
+                                   "impressor", "cresylite",
+                                   "entame", "transudatory",
+                                   "scotale", "pachydermatoid",
+                                   "imaginary", "yeat",
+                                   "slipped", "stewardship",
+                                   "adatom", "cockstone",
+                                   "skyshine", "heavenful",
+                                   "comparability", "exprobratory",
+                                   "dermorhynchous", "parquet",
+                                   "cretaceous", "vesperal",
+                                   "raphis", "undangered",
+                                   "Glecoma", "engrain",
+                                   "counteractively", "Zuludom",
+                                   "orchiocatabasis", "Auriculariales",
+                                   "warriorwise", "extraorganismal",
+                                   "overbuilt", "alveolite",
+                                   "tetchy", "terrificness",
+                                   "widdle", "unpremonished",
+                                   "rebilling", "sequestrum",
+                                   "equiconvex", "heliocentricism",
+                                   "catabaptist", "okonite",
+                                   "propheticism", "helminthagogic",
+                                   "calycular", "giantly",
+                                   "wingable", "golem",
+                                   "unprovided", "commandingness",
+                                   "greave", "haply",
+                                   "doina", "depressingly",
+                                   "subdentate", "impairment",
+                                   "decidable", "neurotrophic",
+                                   "unpredict", "bicorporeal",
+                                   "pendulant", "flatman",
+                                   "intrabred", "toplike",
+                                   "Prosobranchiata", "farrantly",
+                                   "toxoplasmosis", "gorilloid",
+                                   "dipsomaniacal", "aquiline",
+                                   "atlantite", "ascitic",
+                                   "perculsive", "prospectiveness",
+                                   "saponaceous", "centrifugalization",
+                                   "dinical", "infravaginal",
+                                   "beadroll", "affaite",
+                                   "Helvidian", "tickleproof",
+                                   "abstractionism", "enhedge",
+                                   "outwealth", "overcontribute",
+                                   "coldfinch", "gymnastic",
+                                   "Pincian", "Munychian",
+                                   "codisjunct", "quad",
+                                   "coracomandibular", "phoenicochroite",
+                                   "amender", "selectivity",
+                                   "putative", "semantician",
+                                   "lophotrichic", "Spatangoidea",
+                                   "saccharogenic", "inferent",
+                                   "Triconodonta", "arrendation",
+                                   "sheepskin", "taurocolla",
+                                   "bunghole", "Machiavel",
+                                   "triakistetrahedral", "dehairer",
+                                   "prezygapophysial", "cylindric",
+                                   "pneumonalgia", "sleigher",
+                                   "emir", "Socraticism",
+                                   "licitness", "massedly",
+                                   "instructiveness", "sturdied",
+                                   "redecrease", "starosta",
+                                   "evictor", "orgiastic",
+                                   "squdge", "meloplasty",
+                                   "Tsonecan", "repealableness",
+                                   "swoony", "myesthesia",
+                                   "molecule", "autobiographist",
+                                   "reciprocation", "refective",
+                                   "unobservantness", "tricae",
+                                   "ungouged", "floatability",
+                                   "Mesua", "fetlocked",
+                                   "chordacentrum", "sedentariness",
+                                   "various", "laubanite",
+                                   "nectopod", "zenick",
+                                   "sequentially", "analgic",
+                                   "biodynamics", "posttraumatic",
+                                   "nummi", "pyroacetic",
+                                   "bot", "redescend",
+                                   "dispermy", "undiffusive",
+                                   "circular", "trillion",
+                                   "Uraniidae", "ploration",
+                                   "discipular", "potentness",
+                                   "sud", "Hu",
+                                   "Eryon", "plugger",
+                                   "subdrainage", "jharal",
+                                   "abscission", "supermarket",
+                                   "countergabion", "glacierist",
+                                   "lithotresis", "minniebush",
+                                   "zanyism", "eucalypteol",
+                                   "sterilely", "unrealize",
+                                   "unpatched", "hypochondriacism",
+                                   "critically", "cheesecutter",
+                                  };
+}

+ 298 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/RandomWriter.java

@@ -0,0 +1,298 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.ClusterStatus;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * This program uses map/reduce to just run a distributed job where there is
+ * no interaction between the tasks and each task writes a large unsorted
+ * random binary sequence file of BytesWritable.
+ * In order for this program to generate data for terasort with 10-byte keys
+ * and 90-byte values, use the following configuration:
+ * <xmp>
+ * <?xml version="1.0"?>
+ * <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+ * <configuration>
+ *   <property>
+ *     <name>mapreduce.randomwriter.minkey</name>
+ *     <value>10</value>
+ *   </property>
+ *   <property>
+ *     <name>mapreduce.randomwriter.maxkey</name>
+ *     <value>10</value>
+ *   </property>
+ *   <property>
+ *     <name>mapreduce.randomwriter.minvalue</name>
+ *     <value>90</value>
+ *   </property>
+ *   <property>
+ *     <name>mapreduce.randomwriter.maxvalue</name>
+ *     <value>90</value>
+ *   </property>
+ *   <property>
+ *     <name>mapreduce.randomwriter.totalbytes</name>
+ *     <value>1099511627776</value>
+ *   </property>
+ * </configuration></xmp>
+ * 
+ * Equivalently, {@link RandomWriter} also supports all the above options
+ * and ones supported by {@link GenericOptionsParser} via the command-line.
+ */
+public class RandomWriter extends Configured implements Tool {
+  public static final String TOTAL_BYTES = "mapreduce.randomwriter.totalbytes";
+  public static final String BYTES_PER_MAP = 
+    "mapreduce.randomwriter.bytespermap";
+  public static final String MAPS_PER_HOST = 
+    "mapreduce.randomwriter.mapsperhost";
+  public static final String MAX_VALUE = "mapreduce.randomwriter.maxvalue";
+  public static final String MIN_VALUE = "mapreduce.randomwriter.minvalue";
+  public static final String MIN_KEY = "mapreduce.randomwriter.minkey";
+  public static final String MAX_KEY = "mapreduce.randomwriter.maxkey";
+  
+  /**
+   * User counters
+   */
+  static enum Counters { RECORDS_WRITTEN, BYTES_WRITTEN }
+  
+  /**
+   * A custom input format that creates virtual inputs of a single string
+   * for each map.
+   */
+  static class RandomInputFormat extends InputFormat<Text, Text> {
+
+    /** 
+     * Generate the requested number of file splits, with the filename
+     * set to the filename of the output file.
+     */
+    public List<InputSplit> getSplits(JobContext job) throws IOException {
+      List<InputSplit> result = new ArrayList<InputSplit>();
+      Path outDir = FileOutputFormat.getOutputPath(job);
+      int numSplits = 
+            job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
+      for(int i=0; i < numSplits; ++i) {
+        result.add(new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, 
+                                  (String[])null));
+      }
+      return result;
+    }
+
+    /**
+     * Return a single record (filename, "") where the filename is taken from
+     * the file split.
+     */
+    static class RandomRecordReader extends RecordReader<Text, Text> {
+      Path name;
+      Text key = null;
+      Text value = new Text();
+      public RandomRecordReader(Path p) {
+        name = p;
+      }
+      
+      public void initialize(InputSplit split,
+                             TaskAttemptContext context)
+      throws IOException, InterruptedException {
+
+      }
+      
+      public boolean nextKeyValue() {
+        if (name != null) {
+          key = new Text();
+          key.set(name.getName());
+          name = null;
+          return true;
+        }
+        return false;
+      }
+      
+      public Text getCurrentKey() {
+        return key;
+      }
+      
+      public Text getCurrentValue() {
+        return value;
+      }
+      
+      public void close() {}
+
+      public float getProgress() {
+        return 0.0f;
+      }
+    }
+
+    public RecordReader<Text, Text> createRecordReader(InputSplit split,
+        TaskAttemptContext context) throws IOException, InterruptedException {
+      return new RandomRecordReader(((FileSplit) split).getPath());
+    }
+  }
+
+  static class RandomMapper extends Mapper<WritableComparable, Writable,
+                      BytesWritable, BytesWritable> {
+    
+    private long numBytesToWrite;
+    private int minKeySize;
+    private int keySizeRange;
+    private int minValueSize;
+    private int valueSizeRange;
+    private Random random = new Random();
+    private BytesWritable randomKey = new BytesWritable();
+    private BytesWritable randomValue = new BytesWritable();
+    
+    private void randomizeBytes(byte[] data, int offset, int length) {
+      for(int i=offset + length - 1; i >= offset; --i) {
+        data[i] = (byte) random.nextInt(256);
+      }
+    }
+    
+    /**
+     * Given an output filename, write a bunch of random records to it.
+     */
+    public void map(WritableComparable key, 
+                    Writable value,
+                    Context context) throws IOException,InterruptedException {
+      int itemCount = 0;
+      while (numBytesToWrite > 0) {
+        int keyLength = minKeySize + 
+          (keySizeRange != 0 ? random.nextInt(keySizeRange) : 0);
+        randomKey.setSize(keyLength);
+        randomizeBytes(randomKey.getBytes(), 0, randomKey.getLength());
+        int valueLength = minValueSize +
+          (valueSizeRange != 0 ? random.nextInt(valueSizeRange) : 0);
+        randomValue.setSize(valueLength);
+        randomizeBytes(randomValue.getBytes(), 0, randomValue.getLength());
+        context.write(randomKey, randomValue);
+        numBytesToWrite -= keyLength + valueLength;
+        context.getCounter(Counters.BYTES_WRITTEN).increment(keyLength + valueLength);
+        context.getCounter(Counters.RECORDS_WRITTEN).increment(1);
+        if (++itemCount % 200 == 0) {
+          context.setStatus("wrote record " + itemCount + ". " + 
+                             numBytesToWrite + " bytes left.");
+        }
+      }
+      context.setStatus("done with " + itemCount + " records.");
+    }
+    
+    /**
+     * Save the values out of the configuration that we need to write
+     * the data.
+     */
+    @Override
+    public void setup(Context context) {
+      Configuration conf = context.getConfiguration();
+      numBytesToWrite = conf.getLong(BYTES_PER_MAP,
+                                    1*1024*1024*1024);
+      minKeySize = conf.getInt(MIN_KEY, 10);
+      keySizeRange = 
+        conf.getInt(MAX_KEY, 1000) - minKeySize;
+      minValueSize = conf.getInt(MIN_VALUE, 0);
+      valueSizeRange = 
+        conf.getInt(MAX_VALUE, 20000) - minValueSize;
+    }
+  }
+  
+  /**
+   * This is the main routine for launching a distributed random write job.
+   * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
+   * The reduce doesn't do anything.
+   * 
+   * @throws IOException 
+   */
+  public int run(String[] args) throws Exception {    
+    if (args.length == 0) {
+      System.out.println("Usage: writer <out-dir>");
+      ToolRunner.printGenericCommandUsage(System.out);
+      return 2;
+    }
+    
+    Path outDir = new Path(args[0]);
+    Configuration conf = getConf();
+    JobClient client = new JobClient(conf);
+    ClusterStatus cluster = client.getClusterStatus();
+    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
+    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
+                                             1*1024*1024*1024);
+    if (numBytesToWritePerMap == 0) {
+      System.err.println("Cannot have " + BYTES_PER_MAP + " set to 0");
+      return -2;
+    }
+    long totalBytesToWrite = conf.getLong(TOTAL_BYTES, 
+         numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
+    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
+    if (numMaps == 0 && totalBytesToWrite > 0) {
+      numMaps = 1;
+      conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
+    }
+    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
+
+    Job job = new Job(conf);
+    
+    job.setJarByClass(RandomWriter.class);
+    job.setJobName("random-writer");
+    FileOutputFormat.setOutputPath(job, outDir);
+    job.setOutputKeyClass(BytesWritable.class);
+    job.setOutputValueClass(BytesWritable.class);
+    job.setInputFormatClass(RandomInputFormat.class);
+    job.setMapperClass(RandomMapper.class);        
+    job.setReducerClass(Reducer.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+    
+    System.out.println("Running " + numMaps + " maps.");
+    
+    // reducer NONE
+    job.setNumReduceTasks(0);
+    
+    Date startTime = new Date();
+    System.out.println("Job started: " + startTime);
+    int ret = job.waitForCompletion(true) ? 0 : 1;
+    Date endTime = new Date();
+    System.out.println("Job ended: " + endTime);
+    System.out.println("The job took " + 
+                       (endTime.getTime() - startTime.getTime()) /1000 + 
+                       " seconds.");
+    
+    return ret;
+  }
+  
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(new Configuration(), new RandomWriter(), args);
+    System.exit(res);
+  }
+
+}
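
Editor's note: a minimal driver sketch for the class above, showing how the terasort-style settings described in its javadoc could be applied programmatically. The class name RandomWriterDriver, the output path, and the numeric values are illustrative and not part of the patch.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.RandomWriter;
import org.apache.hadoop.util.ToolRunner;

public class RandomWriterDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Fixed 10-byte keys and 90-byte values, ~1 TB of total output,
    // mirroring the configuration block in the RandomWriter javadoc.
    conf.setInt(RandomWriter.MIN_KEY, 10);
    conf.setInt(RandomWriter.MAX_KEY, 10);
    conf.setInt(RandomWriter.MIN_VALUE, 90);
    conf.setInt(RandomWriter.MAX_VALUE, 90);
    conf.setLong(RandomWriter.TOTAL_BYTES, 1099511627776L);
    // RandomWriter takes a single <out-dir> argument; the path is made up.
    System.exit(ToolRunner.run(conf, new RandomWriter(),
        new String[] { "/benchmarks/random-data" }));
  }
}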

+ 111 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestFileInputFormat.java

@@ -19,7 +19,9 @@
 package org.apache.hadoop.mapreduce.lib.input;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 
 import org.junit.Test;
 import static org.junit.Assert.*;
@@ -28,10 +30,15 @@ import static org.mockito.Mockito.*;
 import static org.apache.hadoop.test.MockitoMaker.*;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.BlockLocation;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
 
 public class TestFileInputFormat {
 
@@ -80,4 +87,108 @@ public class TestFileInputFormat {
     ispy.getSplits(job);
     verify(conf).setLong(FileInputFormat.NUM_INPUT_FILES, 1);
   }
+  
+  @Test
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  public void testLastInputSplitAtSplitBoundary() throws Exception {
+    FileInputFormat fif = new FileInputFormatForTest(1024l * 1024 * 1024,
+        128l * 1024 * 1024);
+    Configuration conf = new Configuration();
+    JobContext jobContext = mock(JobContext.class);
+    when(jobContext.getConfiguration()).thenReturn(conf);
+    List<InputSplit> splits = fif.getSplits(jobContext);
+    assertEquals(8, splits.size());
+    for (int i = 0 ; i < splits.size() ; i++) {
+      InputSplit split = splits.get(i);
+      assertEquals(("host" + i), split.getLocations()[0]);
+    }
+  }
+  
+  @Test
+  @SuppressWarnings({ "rawtypes", "unchecked" })
+  public void testLastInputSplitExceedingSplitBoundary() throws Exception {
+    FileInputFormat fif = new FileInputFormatForTest(1027l * 1024 * 1024,
+        128l * 1024 * 1024);
+    Configuration conf = new Configuration();
+    JobContext jobContext = mock(JobContext.class);
+    when(jobContext.getConfiguration()).thenReturn(conf);
+    List<InputSplit> splits = fif.getSplits(jobContext);
+    assertEquals(8, splits.size());
+    for (int i = 0; i < splits.size(); i++) {
+      InputSplit split = splits.get(i);
+      assertEquals(("host" + i), split.getLocations()[0]);
+    }
+  }
+
+  @Test
+  @SuppressWarnings({ "rawtypes", "unchecked" })
+  public void testLastInputSplitSingleSplit() throws Exception {
+    FileInputFormat fif = new FileInputFormatForTest(100l * 1024 * 1024,
+        128l * 1024 * 1024);
+    Configuration conf = new Configuration();
+    JobContext jobContext = mock(JobContext.class);
+    when(jobContext.getConfiguration()).thenReturn(conf);
+    List<InputSplit> splits = fif.getSplits(jobContext);
+    assertEquals(1, splits.size());
+    for (int i = 0; i < splits.size(); i++) {
+      InputSplit split = splits.get(i);
+      assertEquals(("host" + i), split.getLocations()[0]);
+    }
+  }
+
+  private class FileInputFormatForTest<K, V> extends FileInputFormat<K, V> {
+
+    long splitSize;
+    long length;
+
+    FileInputFormatForTest(long length, long splitSize) {
+      this.length = length;
+      this.splitSize = splitSize;
+    }
+
+    @Override
+    public RecordReader<K, V> createRecordReader(InputSplit split,
+        TaskAttemptContext context) throws IOException, InterruptedException {
+      return null;
+    }
+
+    @Override
+    protected List<FileStatus> listStatus(JobContext job) throws IOException {
+      FileStatus mockFileStatus = mock(FileStatus.class);
+      when(mockFileStatus.getBlockSize()).thenReturn(splitSize);
+      Path mockPath = mock(Path.class);
+      FileSystem mockFs = mock(FileSystem.class);
+
+      BlockLocation[] blockLocations = mockBlockLocations(length, splitSize);
+      when(mockFs.getFileBlockLocations(mockFileStatus, 0, length)).thenReturn(
+          blockLocations);
+      when(mockPath.getFileSystem(any(Configuration.class))).thenReturn(mockFs);
+
+      when(mockFileStatus.getPath()).thenReturn(mockPath);
+      when(mockFileStatus.getLen()).thenReturn(length);
+
+      List<FileStatus> list = new ArrayList<FileStatus>();
+      list.add(mockFileStatus);
+      return list;
+    }
+
+    @Override
+    protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
+      return splitSize;
+    }
+
+    private BlockLocation[] mockBlockLocations(long size, long splitSize) {
+      int numLocations = (int) (size / splitSize);
+      if (size % splitSize != 0)
+        numLocations++;
+      BlockLocation[] blockLocations = new BlockLocation[numLocations];
+      for (int i = 0; i < numLocations; i++) {
+        String[] names = new String[] { "b" + i };
+        String[] hosts = new String[] { "host" + i };
+        blockLocations[i] = new BlockLocation(names, hosts, i * splitSize,
+            Math.min(splitSize, size - (splitSize * i)));
+      }
+      return blockLocations;
+    }
+  }
 }
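
Editor's note: a back-of-envelope check of why testLastInputSplitExceedingSplitBoundary expects 8 splits rather than 9 for a 1027 MB file with 128 MB blocks. It assumes FileInputFormat's default split-slop factor of 1.1 (a private constant in that class); the snippet simply replays the split loop with plain longs.

// Replays FileInputFormat's split loop for a 1027 MB file, 128 MB split size.
long length    = 1027L * 1024 * 1024;
long splitSize =  128L * 1024 * 1024;
final double SPLIT_SLOP = 1.1;       // assumed FileInputFormat default
int splits = 0;
long remaining = length;
while (((double) remaining) / splitSize > SPLIT_SLOP) {
  remaining -= splitSize;
  splits++;                          // 7 full 128 MB splits
}
if (remaining != 0) {
  splits++;                          // the 131 MB tail becomes one final split
}
System.out.println(splits);          // 8, matching assertEquals(8, splits.size())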

+ 0 - 0
hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/test/MapredTestDriver.java → hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/test/MapredTestDriver.java


+ 2 - 2
hadoop-mapreduce-project/hadoop-mapreduce-examples/pom.xml

@@ -98,9 +98,9 @@
   <build>
    <plugins>
     <plugin>
-   	 <groupId>org.apache.maven.plugins</groupId>
+    <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-jar-plugin</artifactId>
-  	  <configuration>
+      <configuration>
        <archive>
          <manifest>
            <mainClass>org.apache.hadoop.examples.ExampleDriver</mainClass>

+ 3 - 2
hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraGen.java

@@ -238,7 +238,9 @@ public class TeraGen extends Configured implements Tool {
 
     @Override
     public void cleanup(Context context) {
-      checksumCounter.increment(total.getLow8());
+      if (checksumCounter != null) {
+        checksumCounter.increment(total.getLow8());
+      }
     }
   }
 
@@ -307,5 +309,4 @@ public class TeraGen extends Configured implements Tool {
     int res = ToolRunner.run(new Configuration(), new TeraGen(), args);
     System.exit(res);
   }
-
 }

+ 1 - 15
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/ApplicationConstants.java

@@ -84,21 +84,7 @@ public interface ApplicationConstants {
   public static final String STDERR = "stderr";
 
   public static final String STDOUT = "stdout";
-  
-  /**
-   * Classpath for typical applications.
-   */
-  public static final String[] APPLICATION_CLASSPATH =
-      new String[] {
-        "$HADOOP_CONF_DIR",
-        "$HADOOP_COMMON_HOME/share/hadoop/common/*",
-        "$HADOOP_COMMON_HOME/share/hadoop/common/lib/*",
-        "$HADOOP_HDFS_HOME/share/hadoop/hdfs/*",
-        "$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*",
-        "$YARN_HOME/share/hadoop/mapreduce/*",
-        "$YARN_HOME/share/hadoop/mapreduce/lib/*"
-      };
-  
+
   /**
    * Environment for Applications.
    * 

+ 4 - 0
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

@@ -508,6 +508,10 @@ public class YarnConfiguration extends Configuration {
   public static final long DEFAULT_NM_PROCESS_KILL_WAIT_MS =
       2000;
 
+  /** Classpath for typical applications. */
+  public static final String YARN_APPLICATION_CLASSPATH = YARN_PREFIX
+      + "application.classpath";
+
   public YarnConfiguration() {
     super();
   }

+ 3 - 0
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/WebApp.java

@@ -36,6 +36,7 @@ import com.google.common.collect.Lists;
 import com.google.inject.Provides;
 import com.google.inject.servlet.GuiceFilter;
 import com.google.inject.servlet.ServletModule;
+import com.sun.jersey.api.container.filter.GZIPContentEncodingFilter;
 import com.sun.jersey.api.core.ResourceConfig;
 import com.sun.jersey.core.util.FeaturesAndProperties;
 import com.sun.jersey.guice.spi.container.servlet.GuiceContainer;
@@ -160,6 +161,8 @@ public abstract class WebApp extends ServletModule {
       params.put(ResourceConfig.FEATURE_IMPLICIT_VIEWABLES, "true");
       params.put(ServletContainer.FEATURE_FILTER_FORWARD_ON_404, "true");
       params.put(FeaturesAndProperties.FEATURE_XMLROOTELEMENT_PROCESSING, "true");
+      params.put(ResourceConfig.PROPERTY_CONTAINER_REQUEST_FILTERS, GZIPContentEncodingFilter.class.getName());
+      params.put(ResourceConfig.PROPERTY_CONTAINER_RESPONSE_FILTERS, GZIPContentEncodingFilter.class.getName());
       filter("/*").through(GuiceContainer.class, params);
     }
 

+ 14 - 0
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/resources/yarn-default.xml

@@ -482,4 +482,18 @@
      <name>yarn.web-proxy.address</name>
      <value/>
   </property>
+
+  <property>
+    <description>Classpath for typical applications.</description>
+     <name>yarn.application.classpath</name>
+     <value>
+        $HADOOP_CONF_DIR,
+        $HADOOP_COMMON_HOME/share/hadoop/common/*,
+        $HADOOP_COMMON_HOME/share/hadoop/common/lib/*,
+        $HADOOP_HDFS_HOME/share/hadoop/hdfs/*,
+        $HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,
+        $YARN_HOME/share/hadoop/mapreduce/*,
+        $YARN_HOME/share/hadoop/mapreduce/lib/*
+     </value>
+  </property>
 </configuration>
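
Editor's note: a small sketch of how the new yarn.application.classpath property (exposed as YarnConfiguration.YARN_APPLICATION_CLASSPATH in the change above) might be consumed when assembling a container's CLASSPATH. The joining logic is illustrative, not part of the patch.

import org.apache.hadoop.yarn.conf.YarnConfiguration;

// Read the comma-separated entries and join them with ':' for a container
// launch environment. The XML value above spans several lines, so each
// entry is trimmed defensively.
YarnConfiguration conf = new YarnConfiguration();
StringBuilder classpath = new StringBuilder();
String[] entries = conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH);
if (entries != null) {
  for (String entry : entries) {
    if (classpath.length() > 0) {
      classpath.append(':');
    }
    classpath.append(entry.trim());
  }
}
System.out.println(classpath);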

+ 7 - 10
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApp.java

@@ -295,10 +295,6 @@ public class SchedulerApp {
     }
   }
 
-  public synchronized void setAvailableResourceLimit(Resource globalLimit) {
-    this.resourceLimit = globalLimit; 
-  }
-
   public synchronized RMContainer getRMContainer(ContainerId id) {
     return liveContainers.get(id);
   }
@@ -446,20 +442,21 @@ public class SchedulerApp {
     return reservedContainers;
   }
   
+  public synchronized void setHeadroom(Resource globalLimit) {
+    this.resourceLimit = globalLimit; 
+  }
+
   /**
    * Get available headroom in terms of resources for the application's user.
    * @return available resource headroom
    */
   public synchronized Resource getHeadroom() {
-    Resource limit = Resources.subtract(resourceLimit, currentConsumption);
-    Resources.subtractFrom(limit, currentReservation);
-
     // Corner case to deal with applications being slightly over-limit
-    if (limit.getMemory() < 0) {
-      limit.setMemory(0);
+    if (resourceLimit.getMemory() < 0) {
+      resourceLimit.setMemory(0);
     }
     
-    return limit;
+    return resourceLimit;
   }
 
   public Queue getQueue() {

+ 29 - 1
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueUtils.java

@@ -17,12 +17,19 @@
 */
 package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
 
+import org.apache.hadoop.yarn.api.records.Resource;
+
 class CSQueueUtils {
   
   public static void checkMaxCapacity(String queueName, 
       float capacity, float maximumCapacity) {
-    if (Math.round(100 * maximumCapacity) != CapacitySchedulerConfiguration.UNDEFINED && 
+    if (maximumCapacity < 0.0f || maximumCapacity > 1.0f || 
         maximumCapacity < capacity) {
+      throw new IllegalArgumentException(
+          "Illegal value  of maximumCapacity " + maximumCapacity + 
+          " used in call to setMaxCapacity for queue " + queueName);
+    }
+    if (maximumCapacity < capacity) {
       throw new IllegalArgumentException(
           "Illegal call to setMaxCapacity. " +
           "Queue '" + queueName + "' has " +
@@ -30,5 +37,26 @@ class CSQueueUtils {
           "maximumCapacity (" + maximumCapacity + ")" );
     }
   }
+
+  public static float computeAbsoluteMaximumCapacity(
+      float maximumCapacity, CSQueue parent) {
+    float parentAbsMaxCapacity = 
+        (parent == null) ? 1.0f : parent.getAbsoluteMaximumCapacity();
+    return (parentAbsMaxCapacity * maximumCapacity);
+  }
+
+  public static int computeMaxActiveApplications(Resource clusterResource,
+      float maxAMResourcePercent, float absoluteCapacity) {
+    return 
+        Math.max(
+            (int)((clusterResource.getMemory() / (float)LeafQueue.DEFAULT_AM_RESOURCE) * 
+                   maxAMResourcePercent * absoluteCapacity), 
+            1);
+  }
+
+  public static int computeMaxActiveApplicationsPerUser(
+      int maxActiveApplications, int userLimit, float userLimitFactor) {
+    return (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor);
+  }
   
 }
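
Editor's note: a worked example of the two limits computed by the new CSQueueUtils helpers. The numbers, and the 2 GB value assumed for LeafQueue.DEFAULT_AM_RESOURCE, are illustrative; the arithmetic only restates the formulas above.

// Illustrative numbers only.
int   clusterMemoryMb      = 100 * 1024;  // 100 GB of cluster memory
int   amResourceMb         = 2 * 1024;    // assumed LeafQueue.DEFAULT_AM_RESOURCE
float maxAMResourcePercent = 0.1f;
float absoluteCapacity     = 0.5f;
int maxActiveApplications = Math.max(
    (int) ((clusterMemoryMb / (float) amResourceMb)
        * maxAMResourcePercent * absoluteCapacity), 1);    // 50 * 0.1 * 0.5 -> 2

int   userLimit       = 100;              // percent
float userLimitFactor = 1.0f;
int maxActiveApplicationsPerUser =
    (int) (maxActiveApplications * (userLimit / 100.0f) * userLimitFactor);  // -> 2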

+ 6 - 2
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java

@@ -149,7 +149,7 @@ public class CapacitySchedulerConfiguration extends Configuration {
       throw new IllegalArgumentException("Illegal " +
       		"capacity of " + capacity + " for queue " + queue);
     }
-    LOG.debug("CSConf - setCapacity: queuePrefix=" + getQueuePrefix(queue) + 
+    LOG.debug("CSConf - getCapacity: queuePrefix=" + getQueuePrefix(queue) + 
         ", capacity=" + capacity);
     return capacity;
   }
@@ -162,11 +162,15 @@ public class CapacitySchedulerConfiguration extends Configuration {
 
   public int getMaximumCapacity(String queue) {
     int maxCapacity = 
-      getInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, UNDEFINED);
+      getInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, MAXIMUM_CAPACITY_VALUE);
     return maxCapacity;
   }
   
   public void setMaximumCapacity(String queue, int maxCapacity) {
+    if (maxCapacity > MAXIMUM_CAPACITY_VALUE) {
+      throw new IllegalArgumentException("Illegal " +
+          "maximum-capacity of " + maxCapacity + " for queue " + queue);
+    }
     setInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, maxCapacity);
     LOG.debug("CSConf - setMaxCapacity: queuePrefix=" + getQueuePrefix(queue) + 
         ", maxCapacity=" + maxCapacity);

+ 59 - 61
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java

@@ -144,10 +144,10 @@ public class LeafQueue implements CSQueue {
       (float)cs.getConfiguration().getCapacity(getQueuePath()) / 100;
     float absoluteCapacity = parent.getAbsoluteCapacity() * capacity;
 
-    float maximumCapacity = (float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
+    float maximumCapacity = 
+        (float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
     float absoluteMaxCapacity = 
-      (Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ? 
-          Float.MAX_VALUE : (parent.getAbsoluteCapacity() * maximumCapacity);
+        CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
 
     int userLimit = cs.getConfiguration().getUserLimit(getQueuePath());
     float userLimitFactor = 
@@ -161,10 +161,10 @@ public class LeafQueue implements CSQueue {
     this.maxAMResourcePercent = 
         cs.getConfiguration().getMaximumApplicationMasterResourcePercent();
     int maxActiveApplications = 
-        computeMaxActiveApplications(cs.getClusterResources(), 
+        CSQueueUtils.computeMaxActiveApplications(cs.getClusterResources(), 
             maxAMResourcePercent, absoluteCapacity);
     int maxActiveApplicationsPerUser = 
-        computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit, 
+        CSQueueUtils.computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit, 
             userLimitFactor);
 
     this.queueInfo = recordFactory.newRecordInstance(QueueInfo.class);
@@ -193,20 +193,6 @@ public class LeafQueue implements CSQueue {
     this.activeApplications = new TreeSet<SchedulerApp>(applicationComparator);
   }
 
-  private int computeMaxActiveApplications(Resource clusterResource,
-      float maxAMResourcePercent, float absoluteCapacity) {
-    return 
-        Math.max(
-            (int)((clusterResource.getMemory() / (float)DEFAULT_AM_RESOURCE) * 
-                   maxAMResourcePercent * absoluteCapacity), 
-            1);
-  }
-  
-  private int computeMaxActiveApplicationsPerUser(int maxActiveApplications, 
-      int userLimit, float userLimitFactor) {
-    return (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor);
-  }
-  
   private synchronized void setupQueueConfigs(
       float capacity, float absoluteCapacity, 
       float maximumCapacity, float absoluteMaxCapacity,
@@ -254,8 +240,8 @@ public class LeafQueue implements CSQueue {
         "maxCapacity = " + maximumCapacity +
         " [= configuredMaxCapacity ]" + "\n" +
         "absoluteMaxCapacity = " + absoluteMaxCapacity +
-        " [= Float.MAX_VALUE if maximumCapacity undefined, " +
-        "(parentAbsoluteCapacity * maximumCapacity) / 100 otherwise ]" + "\n" +
+        " [= 1.0 maximumCapacity undefined, " +
+        "(parentAbsoluteMaxCapacity * maximumCapacity) / 100 otherwise ]" + "\n" +
         "userLimit = " + userLimit +
         " [= configuredUserLimit ]" + "\n" +
         "userLimitFactor = " + userLimitFactor +
@@ -272,9 +258,9 @@ public class LeafQueue implements CSQueue {
         "maxActiveApplicationsPerUser = " + maxActiveApplicationsPerUser +
         " [= (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor) ]" + "\n" +
         "utilization = " + utilization +
-        " [= usedResourcesMemory / queueLimit ]" + "\n" +
+        " [= usedResourcesMemory /  (clusterResourceMemory * absoluteCapacity)]" + "\n" +
         "usedCapacity = " + usedCapacity +
-        " [= usedResourcesMemory / (clusterResourceMemory * capacity) ]" + "\n" +
+        " [= usedResourcesMemory / (clusterResourceMemory * parent.absoluteCapacity)]" + "\n" +
         "maxAMResourcePercent = " + maxAMResourcePercent +
         " [= configuredMaximumAMResourcePercent ]" + "\n" +
         "minimumAllocationFactor = " + minimumAllocationFactor +
@@ -400,9 +386,7 @@ public class LeafQueue implements CSQueue {
     
     this.maximumCapacity = maximumCapacity;
     this.absoluteMaxCapacity = 
-      (Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ? 
-          Float.MAX_VALUE : 
-          (parent.getAbsoluteCapacity() * maximumCapacity);
+        CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
   }
   
   /**
@@ -502,9 +486,14 @@ public class LeafQueue implements CSQueue {
   }
 
   public String toString() {
-    return queueName + ":" + capacity + ":" + absoluteCapacity + ":" + 
-    getUsedCapacity() + ":" + getUtilization() + ":" + 
-    getNumApplications() + ":" + getNumContainers();
+    return queueName + ": " + 
+        "capacity=" + capacity + ", " + 
+        "absoluteCapacity=" + absoluteCapacity + ", " + 
+        "usedResources=" + usedResources.getMemory() + "MB, " + 
+        "usedCapacity=" + getUsedCapacity() + ", " + 
+        "utilization=" + getUtilization() + ", " + 
+        "numApps=" + getNumApplications() + ", " + 
+        "numContainers=" + getNumContainers();  
   }
 
   private synchronized User getUser(String userName) {
@@ -731,12 +720,11 @@ public class LeafQueue implements CSQueue {
       if(LOG.isDebugEnabled()) {
         LOG.debug("pre-assignContainers for application "
         + application.getApplicationId());
+        application.showRequests();
       }
-      application.showRequests();
 
       synchronized (application) {
-        computeAndSetUserResourceLimit(application, clusterResource);
-        
+        // Schedule in priority order
         for (Priority priority : application.getPriorities()) {
           // Required resource
           Resource required = 
@@ -747,15 +735,21 @@ public class LeafQueue implements CSQueue {
             continue;
           }
 
-          // Are we going over limits by allocating to this application?
-          // Maximum Capacity of the queue
+          // Compute & set headroom
+          // Note: We set the headroom with the highest priority request 
+          //       as the target. 
+          //       This works since we never assign lower priority requests
+          //       before all higher priority ones are serviced.
+          Resource userLimit = 
+              computeAndSetUserResourceLimit(application, clusterResource, 
+                  required);
+
+          // Check queue max-capacity limit
           if (!assignToQueue(clusterResource, required)) {
             return NULL_ASSIGNMENT;
           }
 
-          // User limits
-          Resource userLimit = 
-            computeUserLimit(application, clusterResource, required); 
+          // Check user limit
           if (!assignToUser(application.getUser(), userLimit)) {
             break; 
           }
@@ -769,7 +763,7 @@ public class LeafQueue implements CSQueue {
                 null);
           
           Resource assigned = assignment.getResource();
-            
+          
           // Did we schedule or reserve a container?
           if (Resources.greaterThan(assigned, Resources.none())) {
 
@@ -830,25 +824,28 @@ public class LeafQueue implements CSQueue {
     float potentialNewCapacity = 
       (float)(usedResources.getMemory() + required.getMemory()) / 
         clusterResource.getMemory();
-    LOG.info(getQueueName() + 
-        " usedResources: " + usedResources.getMemory() + 
-        " currentCapacity " + ((float)usedResources.getMemory())/clusterResource.getMemory() + 
-        " required " + required.getMemory() +
-        " potentialNewCapacity: " + potentialNewCapacity + " ( " +
-        " max-capacity: " + absoluteMaxCapacity + ")");
     if (potentialNewCapacity > absoluteMaxCapacity) {
+      LOG.info(getQueueName() + 
+          " usedResources: " + usedResources.getMemory() +
+          " clusterResources: " + clusterResource.getMemory() +
+          " currentCapacity " + ((float)usedResources.getMemory())/clusterResource.getMemory() + 
+          " required " + required.getMemory() +
+          " potentialNewCapacity: " + potentialNewCapacity + " ( " +
+          " max-capacity: " + absoluteMaxCapacity + ")");
       return false;
     }
     return true;
   }
 
-  private void computeAndSetUserResourceLimit(SchedulerApp application, 
-      Resource clusterResource) {
-    Resource userLimit = 
-        computeUserLimit(application, clusterResource, Resources.none());
-    application.setAvailableResourceLimit(userLimit);
-    metrics.setAvailableResourcesToUser(application.getUser(), 
-        application.getHeadroom());
+  private Resource computeAndSetUserResourceLimit(SchedulerApp application, 
+      Resource clusterResource, Resource required) {
+    String user = application.getUser();
+    Resource limit = computeUserLimit(application, clusterResource, required);
+    Resource headroom = 
+        Resources.subtract(limit, getUser(user).getConsumedResources());
+    application.setHeadroom(headroom);
+    metrics.setAvailableResourcesToUser(user, headroom);
+    return limit;
   }
   
   private int roundUp(int memory) {
@@ -919,7 +916,7 @@ public class LeafQueue implements CSQueue {
     User user = getUser(userName);
     
     // Note: We aren't considering the current request since there is a fixed
-    // overhead of the AM, but it's a >= check, so... 
+    // overhead of the AM, but it's a > check, not a >= check, so... 
     if ((user.getConsumedResources().getMemory()) > limit.getMemory()) {
       if (LOG.isDebugEnabled()) {
         LOG.debug("User " + userName + " in queue " + getQueueName() + 
@@ -1237,8 +1234,8 @@ public class LeafQueue implements CSQueue {
         // happen under scheduler's lock... 
         // So, this is, in effect, a transaction across application & node
         if (rmContainer.getState() == RMContainerState.RESERVED) {
-          application.unreserve(node, rmContainer.getReservedPriority());
-          node.unreserveResource(application);
+          unreserve(application, rmContainer.getReservedPriority(), 
+              node, rmContainer);
         } else {
           application.containerCompleted(rmContainer, containerStatus, event);
           node.releaseContainer(container);
@@ -1303,24 +1300,25 @@ public class LeafQueue implements CSQueue {
   public synchronized void updateClusterResource(Resource clusterResource) {
     // Update queue properties
     maxActiveApplications = 
-        computeMaxActiveApplications(clusterResource, maxAMResourcePercent, 
+        CSQueueUtils.computeMaxActiveApplications(clusterResource, maxAMResourcePercent, 
             absoluteCapacity);
     maxActiveApplicationsPerUser = 
-        computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit, 
+        CSQueueUtils.computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit, 
             userLimitFactor);
     
     // Update application properties
     for (SchedulerApp application : activeApplications) {
-      computeAndSetUserResourceLimit(application, clusterResource);
+      computeAndSetUserResourceLimit(
+          application, clusterResource, Resources.none());
     }
   }
   
   private synchronized void updateResource(Resource clusterResource) {
-    float queueLimit = clusterResource.getMemory() * absoluteCapacity; 
+    float queueLimit = clusterResource.getMemory() * absoluteCapacity;
     setUtilization(usedResources.getMemory() / queueLimit);
-    setUsedCapacity(
-        usedResources.getMemory() / (clusterResource.getMemory() * capacity));
-    
+    setUsedCapacity(usedResources.getMemory()
+        / (clusterResource.getMemory() * parent.getAbsoluteCapacity()));
+
     Resource resourceLimit = 
       Resources.createResource(roundUp((int)queueLimit));
     metrics.setAvailableResourcesToQueue(
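
Editor's note: a sketch of the headroom semantics introduced above. Headroom is now the computed user limit minus what that user already consumes across all of their applications, recomputed against the highest outstanding priority request, rather than limit minus the single application's consumption and reservation. The values and the import path for the Resources helper are assumptions (the helper is the one already referenced in this diff).

import org.apache.hadoop.yarn.api.records.Resource;
// Package as used on this branch; the Resources helper moved in later releases.
import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;

// Illustrative values; in the patch these come from computeUserLimit() and
// getUser(user).getConsumedResources().
Resource userLimit    = Resources.createResource(8 * 1024);  // 8 GB user limit
Resource userConsumed = Resources.createResource(5 * 1024);  // 5 GB already in use
Resource headroom     = Resources.subtract(userLimit, userConsumed);  // 3 GB left
// application.setHeadroom(headroom) and
// metrics.setAvailableResourcesToUser(user, headroom) then publish this value.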

+ 19 - 18
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java

@@ -118,16 +118,14 @@ public class ParentQueue implements CSQueue {
     }
 
     float capacity = (float) rawCapacity / 100;
-
     float parentAbsoluteCapacity = 
-      (parent == null) ? 1.0f : parent.getAbsoluteCapacity();
+      (rootQueue) ? 1.0f : parent.getAbsoluteCapacity();
     float absoluteCapacity = parentAbsoluteCapacity * capacity; 
 
-    float maximumCapacity = 
+    float  maximumCapacity =
       (float) cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
     float absoluteMaxCapacity = 
-      (Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ? 
-          Float.MAX_VALUE :  (parentAbsoluteCapacity * maximumCapacity);
+          CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
     
     QueueState state = cs.getConfiguration().getState(getQueuePath());
 
@@ -333,10 +331,15 @@ public class ParentQueue implements CSQueue {
   }
 
   public String toString() {
-    return queueName + ":" + capacity + ":" + absoluteCapacity + ":" + 
-      getUsedCapacity() + ":" + getUtilization() + ":" + 
-      getNumApplications() + ":" + getNumContainers() + ":" + 
-      childQueues.size() + " child-queues";
+    return queueName + ": " +
+        "numChildQueue= " + childQueues.size() + ", " + 
+        "capacity=" + capacity + ", " +  
+        "absoluteCapacity=" + absoluteCapacity + ", " +
+        "usedResources=" + usedResources.getMemory() + "MB, " + 
+        "usedCapacity=" + getUsedCapacity() + ", " + 
+        "utilization=" + getUtilization() + ", " +
+        "numApps=" + getNumApplications() + ", " + 
+        "numContainers=" + getNumContainers();
   }
   
   @Override
@@ -492,12 +495,8 @@ public class ParentQueue implements CSQueue {
     CSQueueUtils.checkMaxCapacity(getQueueName(), capacity, maximumCapacity);
     
     this.maximumCapacity = maximumCapacity;
-    float parentAbsoluteCapacity = 
-        (rootQueue) ? 100.0f : parent.getAbsoluteCapacity();
     this.absoluteMaxCapacity = 
-      (maximumCapacity == CapacitySchedulerConfiguration.UNDEFINED) ? 
-          Float.MAX_VALUE : 
-          (parentAbsoluteCapacity * maximumCapacity);
+        CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
   }
 
   @Override
@@ -688,11 +687,13 @@ public class ParentQueue implements CSQueue {
   }
   
   private synchronized void updateResource(Resource clusterResource) {
-    float queueLimit = clusterResource.getMemory() * absoluteCapacity; 
+    float queueLimit = clusterResource.getMemory() * absoluteCapacity;
+    float parentAbsoluteCapacity = 
+        (rootQueue) ? 1.0f : parent.getAbsoluteCapacity();
     setUtilization(usedResources.getMemory() / queueLimit);
-    setUsedCapacity(
-        usedResources.getMemory() / (clusterResource.getMemory() * capacity));
-    
+    setUsedCapacity(usedResources.getMemory() 
+        / (clusterResource.getMemory() * parentAbsoluteCapacity));
+  
     Resource resourceLimit = 
       Resources.createResource((int)queueLimit);
     metrics.setAvailableResourcesToQueue(
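
Note: in the updateResource() hunks above, used-capacity is now computed against the parent's absolute share of the cluster, while utilization stays relative to the queue's own absolute capacity. A small self-contained sketch of that arithmetic (floats and example figures are assumptions for illustration):

// Illustrative arithmetic only, mirroring the two formulas in the diff above.
public class QueueCapacitySketch {
  public static void main(String[] args) {
    float clusterMB = 102400f;              // assumed 100 GB cluster
    float absoluteCapacity = 0.10f;         // this queue's absolute share: 10%
    float parentAbsoluteCapacity = 0.20f;   // its parent's absolute share: 20%
    float usedMB = 5120f;                   // 5 GB currently used

    float queueLimit   = clusterMB * absoluteCapacity;
    float utilization  = usedMB / queueLimit;                           // 0.5
    float usedCapacity = usedMB / (clusterMB * parentAbsoluteCapacity); // 0.25

    System.out.println(utilization + " " + usedCapacity);
  }
}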

+ 1 - 1
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java

@@ -358,7 +358,7 @@ public class FifoScheduler implements ResourceScheduler {
         }
       }
       
-      application.setAvailableResourceLimit(clusterResource);
+      application.setHeadroom(clusterResource);
       
       LOG.debug("post-assignContainers");
       application.showRequests();

+ 62 - 16
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java

@@ -21,16 +21,24 @@ import static org.junit.Assert.*;
 import static org.mockito.Mockito.*;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.Priority;
 import org.apache.hadoop.yarn.api.records.QueueACL;
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceRequest;
+import org.apache.hadoop.yarn.factories.RecordFactory;
+import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
+import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
+import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
 import org.junit.After;
@@ -283,38 +291,76 @@ public class TestApplicationLimits {
     final String user_0 = "user_0";
     final String user_1 = "user_1";
     
-    int APPLICATION_ID = 0;
+    RecordFactory recordFactory = 
+        RecordFactoryProvider.getRecordFactory(null);
+    RMContext rmContext = TestUtils.getMockRMContext();
+
+    Priority priority_1 = TestUtils.createMockPriority(1);
 
-    // Submit first application from user_0, check headroom
-    SchedulerApp app_0_0 = getMockApplication(APPLICATION_ID++, user_0);
+    // Submit first application with some resource-requests from user_0, 
+    // and check headroom
+    final ApplicationAttemptId appAttemptId_0_0 = 
+        TestUtils.getMockApplicationAttemptId(0, 0); 
+    SchedulerApp app_0_0 = 
+        spy(new SchedulerApp(appAttemptId_0_0, user_0, queue, rmContext, null));
     queue.submitApplication(app_0_0, user_0, A);
-    queue.assignContainers(clusterResource, node_0); // Schedule to compute
+
+    List<ResourceRequest> app_0_0_requests = new ArrayList<ResourceRequest>();
+    app_0_0_requests.add(
+        TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2, 
+            priority_1, recordFactory));
+    app_0_0.updateResourceRequests(app_0_0_requests);
+
+    // Schedule to compute 
+    queue.assignContainers(clusterResource, node_0);
     Resource expectedHeadroom = Resources.createResource(10*16*GB);
-    verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
+    verify(app_0_0).setHeadroom(eq(expectedHeadroom));
 
     // Submit second application from user_0, check headroom
-    SchedulerApp app_0_1 = getMockApplication(APPLICATION_ID++, user_0);
+    final ApplicationAttemptId appAttemptId_0_1 = 
+        TestUtils.getMockApplicationAttemptId(1, 0); 
+    SchedulerApp app_0_1 = 
+        spy(new SchedulerApp(appAttemptId_0_1, user_0, queue, rmContext, null));
     queue.submitApplication(app_0_1, user_0, A);
+    
+    List<ResourceRequest> app_0_1_requests = new ArrayList<ResourceRequest>();
+    app_0_1_requests.add(
+        TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2, 
+            priority_1, recordFactory));
+    app_0_1.updateResourceRequests(app_0_1_requests);
+
+    // Schedule to compute 
     queue.assignContainers(clusterResource, node_0); // Schedule to compute
-    verify(app_0_0, times(2)).setAvailableResourceLimit(eq(expectedHeadroom));
-    verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));// no change
+    verify(app_0_0, times(2)).setHeadroom(eq(expectedHeadroom));
+    verify(app_0_1).setHeadroom(eq(expectedHeadroom));// no change
     
     // Submit first application from user_1, check  for new headroom
-    SchedulerApp app_1_0 = getMockApplication(APPLICATION_ID++, user_1);
+    final ApplicationAttemptId appAttemptId_1_0 = 
+        TestUtils.getMockApplicationAttemptId(2, 0); 
+    SchedulerApp app_1_0 = 
+        spy(new SchedulerApp(appAttemptId_1_0, user_1, queue, rmContext, null));
     queue.submitApplication(app_1_0, user_1, A);
+
+    List<ResourceRequest> app_1_0_requests = new ArrayList<ResourceRequest>();
+    app_1_0_requests.add(
+        TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2, 
+            priority_1, recordFactory));
+    app_1_0.updateResourceRequests(app_1_0_requests);
+    
+    // Schedule to compute 
     queue.assignContainers(clusterResource, node_0); // Schedule to compute
     expectedHeadroom = Resources.createResource(10*16*GB / 2); // changes
-    verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
-    verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));
-    verify(app_1_0).setAvailableResourceLimit(eq(expectedHeadroom));
-    
+    verify(app_0_0).setHeadroom(eq(expectedHeadroom));
+    verify(app_0_1).setHeadroom(eq(expectedHeadroom));
+    verify(app_1_0).setHeadroom(eq(expectedHeadroom));
+
     // Now reduce cluster size and check for the smaller headroom
     clusterResource = Resources.createResource(90*16*GB);
     queue.assignContainers(clusterResource, node_0); // Schedule to compute
     expectedHeadroom = Resources.createResource(9*16*GB / 2); // changes
-    verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
-    verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));
-    verify(app_1_0).setAvailableResourceLimit(eq(expectedHeadroom));
+    verify(app_0_0).setHeadroom(eq(expectedHeadroom));
+    verify(app_0_1).setHeadroom(eq(expectedHeadroom));
+    verify(app_1_0).setHeadroom(eq(expectedHeadroom));
   }
   
 

+ 7 - 7
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java

@@ -255,7 +255,7 @@ public class TestLeafQueue {
     // Manipulate queue 'a'
     LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
     //unset maxCapacity
-    a.setMaxCapacity(-0.01f);
+    a.setMaxCapacity(1.0f);
 
     // Users
     final String user_0 = "user_0";
@@ -377,7 +377,7 @@ public class TestLeafQueue {
     // Mock the queue
     LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
     //unset maxCapacity
-    a.setMaxCapacity(-0.01f);
+    a.setMaxCapacity(1.0f);
     
     // Users
     final String user_0 = "user_0";
@@ -491,7 +491,7 @@ public class TestLeafQueue {
     
     // Revert max-capacity and user-limit-factor
     // Now, allocations should goto app_3 since it's under user-limit 
-    a.setMaxCapacity(-0.01f);
+    a.setMaxCapacity(1.0f);
     a.setUserLimitFactor(1);
     a.assignContainers(clusterResource, node_0);
     assertEquals(7*GB, a.getUsedResources().getMemory()); 
@@ -548,7 +548,7 @@ public class TestLeafQueue {
     // Manipulate queue 'a'
     LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
     //unset maxCapacity
-    a.setMaxCapacity(-0.01f);
+    a.setMaxCapacity(1.0f);
 
     // Users
     final String user_0 = "user_0";
@@ -571,7 +571,7 @@ public class TestLeafQueue {
     String host_0 = "host_0";
     SchedulerNode node_0 = TestUtils.getMockNode(host_0, DEFAULT_RACK, 0, 4*GB);
     
-    final int numNodes = 1;
+    final int numNodes = 2;
     Resource clusterResource = Resources.createResource(numNodes * (4*GB));
     when(csContext.getNumClusterNodes()).thenReturn(numNodes);
     
@@ -646,7 +646,7 @@ public class TestLeafQueue {
     // Manipulate queue 'a'
     LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
     //unset maxCapacity
-    a.setMaxCapacity(-0.01f);
+    a.setMaxCapacity(1.0f);
     a.setUserLimitFactor(10);
 
     // Users
@@ -673,7 +673,7 @@ public class TestLeafQueue {
     String host_1 = "host_1";
     SchedulerNode node_1 = TestUtils.getMockNode(host_1, DEFAULT_RACK, 0, 4*GB);
     
-    final int numNodes = 2;
+    final int numNodes = 3;
     Resource clusterResource = Resources.createResource(numNodes * (4*GB));
     when(csContext.getNumClusterNodes()).thenReturn(numNodes);
     when(csContext.getMaximumResourceCapability()).thenReturn(

+ 52 - 62
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestParentQueue.java

@@ -138,12 +138,34 @@ public class TestParentQueue {
     when(queue).assignContainers(eq(clusterResource), eq(node));
   }
   
+  private float computeQueueUsedCapacity(CSQueue queue, 
+      int expectedMemory, Resource clusterResource) {
+    return (
+        ((float)expectedMemory / clusterResource.getMemory()) *
+        queue.getParent().getAbsoluteCapacity()
+      );
+  }
+  
   private float computeQueueUtilization(CSQueue queue, 
       int expectedMemory, Resource clusterResource) {
     return (expectedMemory / 
         (clusterResource.getMemory() * queue.getAbsoluteCapacity()));
   }
   
+  final static float DELTA = 0.0001f;
+  private void verifyQueueMetrics(CSQueue queue, 
+      int expectedMemory, Resource clusterResource) {
+    assertEquals(
+        computeQueueUtilization(queue, expectedMemory, clusterResource), 
+        queue.getUtilization(), 
+        DELTA);
+    assertEquals(
+        computeQueueUsedCapacity(queue, expectedMemory, clusterResource), 
+        queue.getUsedCapacity(), 
+        DELTA);
+
+  }
+  
   @Test
   public void testSingleLevelQueues() throws Exception {
     // Setup queue configs
@@ -173,15 +195,13 @@ public class TestParentQueue {
     // Start testing
     LeafQueue a = (LeafQueue)queues.get(A);
     LeafQueue b = (LeafQueue)queues.get(B);
-    final float delta = 0.0001f;
     
     // Simulate B returning a container on node_0
     stubQueueAllocation(a, clusterResource, node_0, 0*GB);
     stubQueueAllocation(b, clusterResource, node_0, 1*GB);
     root.assignContainers(clusterResource, node_0);
-    assertEquals(0.0f, a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 1*GB, clusterResource), 
-        b.getUtilization(), delta);
+    verifyQueueMetrics(a, 0*GB, clusterResource);
+    verifyQueueMetrics(b, 1*GB, clusterResource);
     
     // Now, A should get the scheduling opportunity since A=0G/6G, B=1G/14G
     stubQueueAllocation(a, clusterResource, node_1, 2*GB);
@@ -192,10 +212,8 @@ public class TestParentQueue {
         any(SchedulerNode.class));
     allocationOrder.verify(b).assignContainers(eq(clusterResource), 
         any(SchedulerNode.class));
-    assertEquals(computeQueueUtilization(a, 2*GB, clusterResource), 
-        a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 2*GB, clusterResource), 
-        b.getUtilization(), delta);
+    verifyQueueMetrics(a, 2*GB, clusterResource);
+    verifyQueueMetrics(b, 2*GB, clusterResource);
 
     // Now, B should get the scheduling opportunity 
     // since A has 2/6G while B has 2/14G
@@ -207,10 +225,8 @@ public class TestParentQueue {
         any(SchedulerNode.class));
     allocationOrder.verify(a).assignContainers(eq(clusterResource), 
         any(SchedulerNode.class));
-    assertEquals(computeQueueUtilization(a, 3*GB, clusterResource), 
-        a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 4*GB, clusterResource), 
-        b.getUtilization(), delta);
+    verifyQueueMetrics(a, 3*GB, clusterResource);
+    verifyQueueMetrics(b, 4*GB, clusterResource);
 
     // Now, B should still get the scheduling opportunity 
     // since A has 3/6G while B has 4/14G
@@ -222,10 +238,8 @@ public class TestParentQueue {
         any(SchedulerNode.class));
     allocationOrder.verify(a).assignContainers(eq(clusterResource), 
         any(SchedulerNode.class));
-    assertEquals(computeQueueUtilization(a, 3*GB, clusterResource), 
-        a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 8*GB, clusterResource), 
-        b.getUtilization(), delta);
+    verifyQueueMetrics(a, 3*GB, clusterResource);
+    verifyQueueMetrics(b, 8*GB, clusterResource);
 
     // Now, A should get the scheduling opportunity 
     // since A has 3/6G while B has 8/14G
@@ -237,10 +251,8 @@ public class TestParentQueue {
         any(SchedulerNode.class));
     allocationOrder.verify(a).assignContainers(eq(clusterResource), 
         any(SchedulerNode.class));
-    assertEquals(computeQueueUtilization(a, 4*GB, clusterResource), 
-        a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 9*GB, clusterResource), 
-        b.getUtilization(), delta);
+    verifyQueueMetrics(a, 4*GB, clusterResource);
+    verifyQueueMetrics(b, 9*GB, clusterResource);
   }
 
   private static final String C = "c";
@@ -323,22 +335,16 @@ public class TestParentQueue {
     CSQueue b2 = queues.get(B2);
     CSQueue b3 = queues.get(B3);
 
-    final float delta = 0.0001f;
-    
     // Simulate C returning a container on node_0
     stubQueueAllocation(a, clusterResource, node_0, 0*GB);
     stubQueueAllocation(b, clusterResource, node_0, 0*GB);
     stubQueueAllocation(c, clusterResource, node_0, 1*GB);
     stubQueueAllocation(d, clusterResource, node_0, 0*GB);
     root.assignContainers(clusterResource, node_0);
-    assertEquals(computeQueueUtilization(a, 0*GB, clusterResource), 
-        a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 0*GB, clusterResource), 
-        b.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(c, 1*GB, clusterResource), 
-        c.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(d, 0*GB, clusterResource), 
-        d.getUtilization(), delta);
+    verifyQueueMetrics(a, 0*GB, clusterResource);
+    verifyQueueMetrics(b, 0*GB, clusterResource);
+    verifyQueueMetrics(c, 1*GB, clusterResource);
+    verifyQueueMetrics(d, 0*GB, clusterResource);
     reset(a); reset(b); reset(c);
 
     // Now get B2 to allocate
@@ -347,12 +353,9 @@ public class TestParentQueue {
     stubQueueAllocation(b2, clusterResource, node_1, 4*GB);
     stubQueueAllocation(c, clusterResource, node_1, 0*GB);
     root.assignContainers(clusterResource, node_1);
-    assertEquals(computeQueueUtilization(a, 0*GB, clusterResource), 
-        a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 4*GB, clusterResource), 
-        b.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(c, 1*GB, clusterResource), 
-        c.getUtilization(), delta);
+    verifyQueueMetrics(a, 0*GB, clusterResource);
+    verifyQueueMetrics(b, 4*GB, clusterResource);
+    verifyQueueMetrics(c, 1*GB, clusterResource);
     reset(a); reset(b); reset(c);
     
     // Now get both A1, C & B3 to allocate in right order
@@ -368,12 +371,9 @@ public class TestParentQueue {
         any(SchedulerNode.class));
     allocationOrder.verify(b).assignContainers(eq(clusterResource), 
         any(SchedulerNode.class));
-    assertEquals(computeQueueUtilization(a, 1*GB, clusterResource), 
-        a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 6*GB, clusterResource), 
-        b.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(c, 3*GB, clusterResource), 
-        c.getUtilization(), delta);
+    verifyQueueMetrics(a, 1*GB, clusterResource);
+    verifyQueueMetrics(b, 6*GB, clusterResource);
+    verifyQueueMetrics(c, 3*GB, clusterResource);
     reset(a); reset(b); reset(c);
     
     // Now verify max-capacity
@@ -399,16 +399,12 @@ public class TestParentQueue {
         any(SchedulerNode.class));
     allocationOrder.verify(c).assignContainers(eq(clusterResource), 
         any(SchedulerNode.class));
-    assertEquals(computeQueueUtilization(a, 3*GB, clusterResource), 
-        a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 8*GB, clusterResource), 
-        b.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(c, 4*GB, clusterResource), 
-        c.getUtilization(), delta);
+    verifyQueueMetrics(a, 3*GB, clusterResource);
+    verifyQueueMetrics(b, 8*GB, clusterResource);
+    verifyQueueMetrics(c, 4*GB, clusterResource);
     reset(a); reset(b); reset(c);
-    
   }
-  
+
   @Test
   public void testOffSwitchScheduling() throws Exception {
     // Setup queue configs
@@ -438,15 +434,13 @@ public class TestParentQueue {
     // Start testing
     LeafQueue a = (LeafQueue)queues.get(A);
     LeafQueue b = (LeafQueue)queues.get(B);
-    final float delta = 0.0001f;
     
     // Simulate B returning a container on node_0
     stubQueueAllocation(a, clusterResource, node_0, 0*GB, NodeType.OFF_SWITCH);
     stubQueueAllocation(b, clusterResource, node_0, 1*GB, NodeType.OFF_SWITCH);
     root.assignContainers(clusterResource, node_0);
-    assertEquals(0.0f, a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 1*GB, clusterResource), 
-        b.getUtilization(), delta);
+    verifyQueueMetrics(a, 0*GB, clusterResource);
+    verifyQueueMetrics(b, 1*GB, clusterResource);
     
     // Now, A should get the scheduling opportunity since A=0G/6G, B=1G/14G
     // also, B gets a scheduling opportunity since A allocates RACK_LOCAL
@@ -458,10 +452,8 @@ public class TestParentQueue {
         any(SchedulerNode.class));
     allocationOrder.verify(b).assignContainers(eq(clusterResource), 
         any(SchedulerNode.class));
-    assertEquals(computeQueueUtilization(a, 2*GB, clusterResource), 
-        a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 2*GB, clusterResource), 
-        b.getUtilization(), delta);
+    verifyQueueMetrics(a, 2*GB, clusterResource);
+    verifyQueueMetrics(b, 2*GB, clusterResource);
     
     // Now, B should get the scheduling opportunity 
     // since A has 2/6G while B has 2/14G, 
@@ -474,10 +466,8 @@ public class TestParentQueue {
         any(SchedulerNode.class));
     allocationOrder.verify(a).assignContainers(eq(clusterResource), 
         any(SchedulerNode.class));
-    assertEquals(computeQueueUtilization(a, 2*GB, clusterResource), 
-        a.getUtilization(), delta);
-    assertEquals(computeQueueUtilization(b, 4*GB, clusterResource), 
-        b.getUtilization(), delta);
+    verifyQueueMetrics(a, 2*GB, clusterResource);
+    verifyQueueMetrics(b, 4*GB, clusterResource);
 
   }
   

+ 31 - 1
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestQueueParsing.java

@@ -30,6 +30,8 @@ public class TestQueueParsing {
 
   private static final Log LOG = LogFactory.getLog(TestQueueParsing.class);
   
+  private static final double DELTA = 0.000001;
+  
   @Test
   public void testQueueParsing() throws Exception {
     CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();
@@ -37,6 +39,20 @@ public class TestQueueParsing {
 
     CapacityScheduler capacityScheduler = new CapacityScheduler();
     capacityScheduler.reinitialize(conf, null, null);
+    
+    CSQueue a = capacityScheduler.getQueue("a");
+    Assert.assertEquals(0.10, a.getAbsoluteCapacity(), DELTA);
+    Assert.assertEquals(0.15, a.getAbsoluteMaximumCapacity(), DELTA);
+    
+    CSQueue b1 = capacityScheduler.getQueue("b1");
+    Assert.assertEquals(0.2 * 0.5, b1.getAbsoluteCapacity(), DELTA);
+    Assert.assertEquals("Parent B has no MAX_CAP", 
+        0.85, b1.getAbsoluteMaximumCapacity(), DELTA);
+    
+    CSQueue c12 = capacityScheduler.getQueue("c12");
+    Assert.assertEquals(0.7 * 0.5 * 0.45, c12.getAbsoluteCapacity(), DELTA);
+    Assert.assertEquals(0.7 * 0.55 * 0.7, 
+        c12.getAbsoluteMaximumCapacity(), DELTA);
   }
   
   private void setupQueueConfiguration(CapacitySchedulerConfiguration conf) {
@@ -47,12 +63,14 @@ public class TestQueueParsing {
     
     final String A = CapacitySchedulerConfiguration.ROOT + ".a";
     conf.setCapacity(A, 10);
+    conf.setMaximumCapacity(A, 15);
     
     final String B = CapacitySchedulerConfiguration.ROOT + ".b";
     conf.setCapacity(B, 20);
-
+    
     final String C = CapacitySchedulerConfiguration.ROOT + ".c";
     conf.setCapacity(C, 70);
+    conf.setMaximumCapacity(C, 70);
 
     LOG.info("Setup top-level queues");
     
@@ -61,15 +79,20 @@ public class TestQueueParsing {
     final String A2 = A + ".a2";
     conf.setQueues(A, new String[] {"a1", "a2"});
     conf.setCapacity(A1, 30);
+    conf.setMaximumCapacity(A1, 45);
     conf.setCapacity(A2, 70);
+    conf.setMaximumCapacity(A2, 85);
     
     final String B1 = B + ".b1";
     final String B2 = B + ".b2";
     final String B3 = B + ".b3";
     conf.setQueues(B, new String[] {"b1", "b2", "b3"});
     conf.setCapacity(B1, 50);
+    conf.setMaximumCapacity(B1, 85);
     conf.setCapacity(B2, 30);
+    conf.setMaximumCapacity(B2, 35);
     conf.setCapacity(B3, 20);
+    conf.setMaximumCapacity(B3, 35);
 
     final String C1 = C + ".c1";
     final String C2 = C + ".c2";
@@ -77,9 +100,13 @@ public class TestQueueParsing {
     final String C4 = C + ".c4";
     conf.setQueues(C, new String[] {"c1", "c2", "c3", "c4"});
     conf.setCapacity(C1, 50);
+    conf.setMaximumCapacity(C1, 55);
     conf.setCapacity(C2, 10);
+    conf.setMaximumCapacity(C2, 25);
     conf.setCapacity(C3, 35);
+    conf.setMaximumCapacity(C3, 38);
     conf.setCapacity(C4, 5);
+    conf.setMaximumCapacity(C4, 5);
     
     LOG.info("Setup 2nd-level queues");
     
@@ -89,8 +116,11 @@ public class TestQueueParsing {
     final String C13 = C1 + ".c13";
     conf.setQueues(C1, new String[] {"c11", "c12", "c13"});
     conf.setCapacity(C11, 15);
+    conf.setMaximumCapacity(C11, 30);
     conf.setCapacity(C12, 45);
+    conf.setMaximumCapacity(C12, 70);
     conf.setCapacity(C13, 40);
+    conf.setMaximumCapacity(C13, 40);
     
     LOG.info("Setup 3rd-level queues");
   }
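
Note: the new assertions above encode the hierarchy math: a queue's absolute (maximum) capacity is the product of the per-level fractions down its path, and an undefined maximum capacity is treated as 100% (the "Parent B has no MAX_CAP" case). A worked sketch of the c12 case, where the helper name is an assumption for the example:

public class AbsoluteCapacitySketch {
  // Multiply a parent's absolute share by this queue's configured fraction.
  static float absolute(float parentAbsolute, float fraction) {
    return parentAbsolute * fraction;
  }

  public static void main(String[] args) {
    // root -> c (capacity 70%, max 70%) -> c1 (50%, max 55%) -> c12 (45%, max 70%)
    float c12AbsoluteCapacity    = absolute(absolute(absolute(1.0f, 0.70f), 0.50f), 0.45f);
    float c12AbsoluteMaxCapacity = absolute(absolute(absolute(1.0f, 0.70f), 0.55f), 0.70f);
    System.out.println(c12AbsoluteCapacity);     // ~0.1575, as asserted above
    System.out.println(c12AbsoluteMaxCapacity);  // ~0.2695
  }
}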

+ 13 - 10
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesCapacitySched.java

@@ -235,12 +235,13 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
         Element qElem = (Element) queues.item(j);
         String qName = WebServicesTestUtils.getXmlString(qElem, "queueName");
         String q = CapacitySchedulerConfiguration.ROOT + "." + qName;
-        verifySubQueueXML(qElem, q, 100);
+        verifySubQueueXML(qElem, q, 100, 100);
       }
     }
   }
 
-  public void verifySubQueueXML(Element qElem, String q, float parentAbsCapacity)
+  public void verifySubQueueXML(Element qElem, String q, 
+      float parentAbsCapacity, float parentAbsMaxCapacity)
       throws Exception {
     NodeList queues = qElem.getElementsByTagName("subQueues");
     QueueInfo qi = (queues != null) ? new QueueInfo() : new LeafQueueInfo();
@@ -258,14 +259,15 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
         WebServicesTestUtils.getXmlString(qElem, "usedResources");
     qi.queueName = WebServicesTestUtils.getXmlString(qElem, "queueName");
     qi.state = WebServicesTestUtils.getXmlString(qElem, "state");
-    verifySubQueueGeneric(q, qi, parentAbsCapacity);
+    verifySubQueueGeneric(q, qi, parentAbsCapacity, parentAbsMaxCapacity);
 
     if (queues != null) {
       for (int j = 0; j < queues.getLength(); j++) {
         Element subqElem = (Element) queues.item(j);
         String qName = WebServicesTestUtils.getXmlString(subqElem, "queueName");
         String q2 = q + "." + qName;
-        verifySubQueueXML(subqElem, q2, qi.absoluteCapacity);
+        verifySubQueueXML(subqElem, q2, 
+            qi.absoluteCapacity, qi.absoluteMaxCapacity);
       }
     } else {
       LeafQueueInfo lqi = (LeafQueueInfo) qi;
@@ -309,7 +311,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
     for (int i = 0; i < arr.length(); i++) {
       JSONObject obj = arr.getJSONObject(i);
       String q = CapacitySchedulerConfiguration.ROOT + "." + obj.getString("queueName");
-      verifySubQueue(obj, q, 100);
+      verifySubQueue(obj, q, 100, 100);
     }
   }
 
@@ -323,7 +325,8 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
     assertTrue("queueName doesn't match", "root".matches(queueName));
   }
 
-  private void verifySubQueue(JSONObject info, String q, float parentAbsCapacity)
+  private void verifySubQueue(JSONObject info, String q, 
+      float parentAbsCapacity, float parentAbsMaxCapacity)
       throws JSONException, Exception {
     int numExpectedElements = 11;
     boolean isParentQueue = true;
@@ -345,7 +348,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
     qi.queueName = info.getString("queueName");
     qi.state = info.getString("state");
 
-    verifySubQueueGeneric(q, qi, parentAbsCapacity);
+    verifySubQueueGeneric(q, qi, parentAbsCapacity, parentAbsMaxCapacity);
 
     if (isParentQueue) {
       JSONArray arr = info.getJSONArray("subQueues");
@@ -353,7 +356,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
       for (int i = 0; i < arr.length(); i++) {
         JSONObject obj = arr.getJSONObject(i);
         String q2 = q + "." + obj.getString("queueName");
-        verifySubQueue(obj, q2, qi.absoluteCapacity);
+        verifySubQueue(obj, q2, qi.absoluteCapacity, qi.absoluteMaxCapacity);
       }
     } else {
       LeafQueueInfo lqi = (LeafQueueInfo) qi;
@@ -371,7 +374,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
   }
 
   private void verifySubQueueGeneric(String q, QueueInfo info,
-      float parentAbsCapacity) throws Exception {
+      float parentAbsCapacity, float parentAbsMaxCapacity) throws Exception {
     String[] qArr = q.split("\\.");
     assertTrue("q name invalid: " + q, qArr.length > 1);
     String qshortName = qArr[qArr.length - 1];
@@ -380,7 +383,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
     assertEquals("capacity doesn't match", csConf.getCapacity(q),
         info.capacity, 1e-3f);
     float expectCapacity = csConf.getMaximumCapacity(q);
-    float expectAbsMaxCapacity = parentAbsCapacity * (info.maxCapacity/100);
+    float expectAbsMaxCapacity = parentAbsMaxCapacity * (info.maxCapacity/100);
     if (CapacitySchedulerConfiguration.UNDEFINED == expectCapacity) {
       expectCapacity = 100;
       expectAbsMaxCapacity = 100;

+ 8 - 5
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/src/main/java/org/apache/hadoop/yarn/server/webproxy/amfilter/AmIpFilter.java

@@ -57,7 +57,7 @@ public class AmIpFilter implements Filter {
     proxyUriBase = conf.getInitParameter(PROXY_URI_BASE);
   }
   
-  private Set<String> getProxyAddresses() throws ServletException {
+  protected Set<String> getProxyAddresses() throws ServletException {
     long now = System.currentTimeMillis();
     synchronized(this) {
       if(proxyAddresses == null || (lastUpdate + updateInterval) >= now) {
@@ -97,10 +97,13 @@ public class AmIpFilter implements Filter {
     }
     
     String user = null;
-    for(Cookie c: httpReq.getCookies()) {
-      if(WebAppProxyServlet.PROXY_USER_COOKIE_NAME.equals(c.getName())){
-        user = c.getValue();
-        break;
+    
+    if (httpReq.getCookies() != null) {
+      for(Cookie c: httpReq.getCookies()) {
+        if(WebAppProxyServlet.PROXY_USER_COOKIE_NAME.equals(c.getName())){
+          user = c.getValue();
+          break;
+        }
       }
     }
     if(user == null) {

+ 121 - 0
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/src/test/java/org/apache/hadoop/yarn/server/webproxy/amfilter/TestAmFilter.java

@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.webproxy.amfilter;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import javax.servlet.Filter;
+import javax.servlet.FilterChain;
+import javax.servlet.FilterConfig;
+import javax.servlet.ServletContext;
+import javax.servlet.ServletException;
+import javax.servlet.ServletRequest;
+import javax.servlet.ServletResponse;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+import org.mockito.Mockito;
+
+
+public class TestAmFilter  {
+
+  private String proxyHost = "bogushost.com";
+  private String proxyUri = "http://bogus";
+
+  private class TestAmIpFilter extends AmIpFilter {
+
+    private Set<String> proxyAddresses = null;
+
+    protected Set<String> getProxyAddresses() {
+      if(proxyAddresses == null) {
+        proxyAddresses = new HashSet<String>();
+      }
+      proxyAddresses.add(proxyHost);
+      return proxyAddresses;
+    }
+  }
+
+
+  private static class DummyFilterConfig implements FilterConfig {
+    final Map<String, String> map;
+
+
+    DummyFilterConfig(Map<String,String> map) {
+      this.map = map;
+    }
+
+    @Override
+    public String getFilterName() {
+      return "dummy";
+    }
+    @Override
+    public String getInitParameter(String arg0) {
+      return map.get(arg0);
+    }
+    @Override
+    public Enumeration<String> getInitParameterNames() {
+      return Collections.enumeration(map.keySet());
+    }
+    @Override
+    public ServletContext getServletContext() {
+      return null;
+    }
+  }
+
+
+  @Test
+  public void filterNullCookies() throws Exception {
+    HttpServletRequest request = Mockito.mock(HttpServletRequest.class);
+
+    Mockito.when(request.getCookies()).thenReturn(null);
+    Mockito.when(request.getRemoteAddr()).thenReturn(proxyHost);
+
+    HttpServletResponse response = Mockito.mock(HttpServletResponse.class);
+
+    final AtomicBoolean invoked = new AtomicBoolean();
+
+    FilterChain chain = new FilterChain() {
+      @Override
+      public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse)
+        throws IOException, ServletException {
+        invoked.set(true);
+      }
+    };
+
+    Map<String, String> params = new HashMap<String, String>();
+    params.put(AmIpFilter.PROXY_HOST, proxyHost);
+    params.put(AmIpFilter.PROXY_URI_BASE, proxyUri);
+    FilterConfig conf = new DummyFilterConfig(params);
+    Filter filter = new TestAmIpFilter();
+    filter.init(conf);
+    filter.doFilter(request, response, chain);
+    Assert.assertTrue(invoked.get());
+    filter.destroy();
+  }
+}

+ 10 - 10
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/ClusterSetup.apt.vm

@@ -95,7 +95,7 @@ Hadoop MapReduce Next Generation - Cluster Setup
 *--------------------------------------+--------------------------------------+
 | DataNode                             | HADOOP_DATANODE_OPTS                 |
 *--------------------------------------+--------------------------------------+
-| Backup NameNode                      | HADOOP_SECONDARYNAMENODE_OPTS        |
+| Secondary NameNode                   | HADOOP_SECONDARYNAMENODE_OPTS        |
 *--------------------------------------+--------------------------------------+
 | ResourceManager                      | YARN_RESOURCEMANAGER_OPTS            |
 *--------------------------------------+--------------------------------------+
@@ -537,15 +537,15 @@ Hadoop MapReduce Next Generation - Cluster Setup
       
   It's recommended to have them share a Unix group, for e.g. <<<hadoop>>>.
       
-*--------------------------------------+--------------------------------------+
-|| User:Group                          || Daemons                             |
-*--------------------------------------+--------------------------------------+
-| hdfs:hadoop                          | NameNode, Backup NameNode, DataNode  |
-*--------------------------------------+--------------------------------------+
-| yarn:hadoop                          | ResourceManager, NodeManager         |
-*--------------------------------------+--------------------------------------+
-| mapred:hadoop                        | MapReduce JobHistory Server          |  
-*--------------------------------------+--------------------------------------+
+*--------------------------------------+----------------------------------------------------------------------+
+|| User:Group                          || Daemons                                                             |
+*--------------------------------------+----------------------------------------------------------------------+
+| hdfs:hadoop                          | NameNode, Secondary NameNode, Checkpoint Node, Backup Node, DataNode |
+*--------------------------------------+----------------------------------------------------------------------+
+| yarn:hadoop                          | ResourceManager, NodeManager                                         |
+*--------------------------------------+----------------------------------------------------------------------+
+| mapred:hadoop                        | MapReduce JobHistory Server                                          |
+*--------------------------------------+----------------------------------------------------------------------+
       
   * <<<Permissions for both HDFS and local fileSystem paths>>>
      

+ 49 - 0
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/WebApplicationProxy.apt.vm

@@ -0,0 +1,49 @@
+~~ Licensed under the Apache License, Version 2.0 (the "License");
+~~ you may not use this file except in compliance with the License.
+~~ You may obtain a copy of the License at
+~~
+~~   http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License. See accompanying LICENSE file.
+
+  ---
+  YARN
+  ---
+  ---
+  ${maven.build.timestamp}
+
+Web Application Proxy
+
+  The Web Application Proxy is part of YARN.  By default it will run as part of
+  the Resource Manager (RM), but can be configured to run in stand-alone mode.
+  The reason for the proxy is to reduce the possibility of web based attacks
+  through YARN.
+
+  In YARN the Application Master (AM) has the responsibility to provide a web UI
+  and to send that link to the RM.  This opens up a number of potential
+  issues.  The RM runs as a trusted user, and people visiting that web
+  address will treat it, and links it provides to them as trusted, when in
+  reality the AM is running as a non-trusted user, and the links it gives to
+  the RM could point to anything malicious or otherwise.  The Web Application
+  Proxy mitigates this risk by warning users that do not own the given
+  application that they are connecting to an untrusted site.
+
+  In addition to this the proxy also tries to reduce the impact that a malicious
+  AM could have on a user.  It primarily does this by stripping out cookies from
+  the user, and replacing them with a single cookie providing the user name of
+  the logged in user.  This is because most web based authentication systems will
+  identify a user based off of a cookie.  By providing this cookie to an
+  untrusted application it opens up the potential for an exploit.  If the cookie
+  is designed properly that potential should be fairly minimal, but this is just
+  to reduce that potential attack vector.  The current proxy implementation does
+  nothing to prevent the AM from providing links to malicious external sites,
+  nor does it do anything to prevent malicious javascript code from running as
+  well.  In fact javascript can be used to get the cookies, so stripping the
+  cookies from the request has minimal benefit at this time.
+
+  In the future we hope to address the attack vectors described above and make
+  attaching to an AM's web UI safer.

+ 2 - 0
hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/index.apt.vm

@@ -47,4 +47,6 @@ MapReduce NextGen aka YARN aka MRv2
 
   * {{{./CapacityScheduler.html}Capacity Scheduler}}
 
+  * {{{./WebApplicationProxy.html}Web Application Proxy}}
+
 

+ 23 - 0
hadoop-project/pom.xml

@@ -223,6 +223,11 @@
         <artifactId>hadoop-archives</artifactId>
         <version>${project.version}</version>
       </dependency>
+      <dependency>
+        <groupId>org.apache.hadoop</groupId>
+        <artifactId>hadoop-distcp</artifactId>
+        <version>${project.version}</version>
+      </dependency>
       <dependency>
         <groupId>org.apache.hadoop</groupId>
         <artifactId>hadoop-rumen</artifactId>
@@ -709,11 +714,21 @@
           <artifactId>maven-project-info-reports-plugin</artifactId>
           <version>2.4</version>
         </plugin>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-resources-plugin</artifactId>
+          <version>2.2</version>
+        </plugin>
         <plugin>
           <groupId>org.codehaus.mojo</groupId>
           <artifactId>exec-maven-plugin</artifactId>
           <version>1.2</version>
         </plugin>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-pdf-plugin</artifactId>
+          <version>1.1</version>
+        </plugin>
       </plugins>
     </pluginManagement>
 
@@ -811,6 +826,14 @@
           </excludes>
         </configuration>
       </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-pdf-plugin</artifactId>
+        <configuration>
+          <outputDirectory>${project.reporting.outputDirectory}</outputDirectory>
+          <includeReports>false</includeReports>
+        </configuration>
+      </plugin>
     </plugins>
   </build>
 

+ 1 - 0
hadoop-project/src/site/site.xml

@@ -61,6 +61,7 @@
       <item name="YARN Architecture" href="hadoop-yarn/hadoop-yarn-site/YARN.html"/>
       <item name="Writing Yarn Applications" href="hadoop-yarn/hadoop-yarn-site/WritingYarnApplications.html"/>
       <item name="Capacity Scheduler" href="hadoop-yarn/hadoop-yarn-site/CapacityScheduler.html"/>
+      <item name="Web Application Proxy" href="hadoop-yarn/hadoop-yarn-site/WebApplicationProxy.html"/>
     </menu>
 
     <menu name="YARN REST API's" inherit="top">

+ 7 - 0
hadoop-tools/hadoop-distcp/README

@@ -0,0 +1,7 @@
+DistCp (distributed copy) is a tool used for large inter/intra-cluster copying. 
+It uses Map/Reduce to effect its distribution, error handling and recovery, 
+and reporting. It expands a list of files and directories into input to map tasks, 
+each of which will copy a partition of the files specified in the source list.
+
+Version 0.1 (2010/08/02 sriksun)
+ - Initial Version

+ 198 - 0
hadoop-tools/hadoop-distcp/pom.xml

@@ -0,0 +1,198 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+<project>
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.hadoop</groupId>
+    <artifactId>hadoop-project</artifactId>
+    <version>0.23.1-SNAPSHOT</version>
+    <relativePath>../../hadoop-project</relativePath>
+  </parent>
+  <groupId>org.apache.hadoop</groupId>
+  <artifactId>hadoop-distcp</artifactId>
+  <version>0.23.1-SNAPSHOT</version>
+  <description>Apache Hadoop Distributed Copy</description>
+  <name>Apache Hadoop Distributed Copy</name>
+  <packaging>jar</packaging>
+
+  <properties>
+    <file.encoding>UTF-8</file.encoding>
+    <downloadSources>true</downloadSources>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-annotations</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-app</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-hs</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-core</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <resources>
+      <resource>
+        <directory>src/main/resources</directory>
+        <filtering>true</filtering>
+      </resource>
+    </resources>
+    <testResources>
+      <testResource>
+        <directory>src/test/resources</directory>
+        <filtering>true</filtering>
+      </testResource>
+    </testResources>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-surefire-plugin</artifactId>
+        <configuration>
+          <forkMode>always</forkMode>
+          <forkedProcessTimeoutInSeconds>600</forkedProcessTimeoutInSeconds>
+          <argLine>-Xmx1024m</argLine>
+          <includes>
+            <include>**/Test*.java</include>
+          </includes>
+          <redirectTestOutputToFile>true</redirectTestOutputToFile>
+          <systemProperties>
+            <property>
+              <name>test.build.data</name>
+              <value>${basedir}/target/test/data</value>
+            </property>
+            <property>
+              <name>hadoop.log.dir</name>
+              <value>target/test/logs</value>
+            </property>
+            <property>
+              <name>org.apache.commons.logging.Log</name>
+              <value>org.apache.commons.logging.impl.SimpleLog</value>
+            </property>
+            <property>
+              <name>org.apache.commons.logging.simplelog.defaultlog</name>
+              <value>warn</value>
+            </property>
+          </systemProperties>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-dependency-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>copy-dependencies</goal>
+            </goals>
+            <configuration>
+              <outputDirectory>${project.build.directory}/lib</outputDirectory>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-checkstyle-plugin</artifactId>
+        <configuration>
+          <enableRulesSummary>true</enableRulesSummary>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifest>
+              <mainClass>org.apache.hadoop.tools.DistCp</mainClass>
+            </manifest>
+          </archive>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-source-plugin</artifactId>
+        <configuration>
+          <attach>true</attach>
+        </configuration>
+        <executions>
+          <execution>
+            <goals>
+              <goal>jar</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-pdf-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>pdf</id>
+            <phase>package</phase>
+            <goals>
+              <goal>pdf</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>

+ 218 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyListing.java

@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.security.Credentials;
+
+import java.io.IOException;
+
+/**
+ * The CopyListing abstraction is responsible for how the list of
+ * sources and targets is constructed, for DistCp's copy function.
+ * The copy-listing should be a SequenceFile<Text, FileStatus>,
+ * located at the path specified to buildListing(),
+ * each entry being a pair of (Source relative path, source file status),
+ * all the paths being fully qualified.
+ */
+public abstract class CopyListing extends Configured {
+
+  private Credentials credentials;
+
+  /**
+   * Build listing function creates the input listing that distcp uses to
+   * perform the copy.
+   *
+   * The build listing is a sequence file that has relative path of a file in the key
+   * and the file status information of the source file in the value
+   *
+   * For instance if the source path is /tmp/data and the traversed path is
+   * /tmp/data/dir1/dir2/file1, then the sequence file would contain
+   *
+   * key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1)
+   *
+   * File would also contain directory entries. Meaning, if /tmp/data/dir1/dir2/file1
+   * is the only file under /tmp/data, the resulting sequence file would contain the
+   * following entries
+   *
+   * key: /dir1 and value: FileStatus(/tmp/data/dir1)
+   * key: /dir1/dir2 and value: FileStatus(/tmp/data/dir1/dir2)
+   * key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1)
+   *
+   * Cases requiring special handling:
+   * If source path is a file (/tmp/file1), contents of the file will be as follows
+   *
+   * TARGET DOES NOT EXIST: Key-"", Value-FileStatus(/tmp/file1)
+   * TARGET IS FILE       : Key-"", Value-FileStatus(/tmp/file1)
+   * TARGET IS DIR        : Key-"/file1", Value-FileStatus(/tmp/file1)  
+   *
+   * @param pathToListFile - Output file where the listing would be stored
+   * @param options - Input options to distcp
+   * @throws IOException - Exception if any
+   */
+  public final void buildListing(Path pathToListFile,
+                                 DistCpOptions options) throws IOException {
+    validatePaths(options);
+    doBuildListing(pathToListFile, options);
+    Configuration config = getConf();
+
+    config.set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, pathToListFile.toString());
+    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, getBytesToCopy());
+    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, getNumberOfPaths());
+
+    checkForDuplicates(pathToListFile);
+  }
+
+  /**
+   * Validate input and output paths
+   *
+   * @param options - Input options
+   * @throws InvalidInputException: If inputs are invalid
+   * @throws IOException: any Exception with FS 
+   */
+  protected abstract void validatePaths(DistCpOptions options)
+      throws IOException, InvalidInputException;
+
+  /**
+   * The interface to be implemented by sub-classes, to create the source/target file listing.
+   * @param pathToListFile Path on HDFS where the listing file is written.
+   * @param options Input Options for DistCp (indicating source/target paths.)
+   * @throws IOException: Thrown on failure to create the listing file.
+   */
+  protected abstract void doBuildListing(Path pathToListFile,
+                                         DistCpOptions options) throws IOException;
+
+  /**
+   * Return the total bytes that distCp should copy for the source paths
+   * This doesn't consider whether the file is the same and should be skipped during copy
+   *
+   * @return total bytes to copy
+   */
+  protected abstract long getBytesToCopy();
+
+  /**
+   * Return the total number of paths to distcp, includes directories as well
+   * This doesn't consider whether file/dir is already present and should be skipped during copy
+   *
+   * @return Total number of paths to distcp
+   */
+  protected abstract long getNumberOfPaths();
+
+  /**
+   * Validate the final resulting path listing to see if there are any duplicate entries
+   *
+   * @param pathToListFile - path listing build by doBuildListing
+   * @throws IOException - Any issues while checking for duplicates and throws
+   * @throws DuplicateFileException - if there are duplicates
+   */
+  private void checkForDuplicates(Path pathToListFile)
+      throws DuplicateFileException, IOException {
+
+    Configuration config = getConf();
+    FileSystem fs = pathToListFile.getFileSystem(config);
+
+    Path sortedList = DistCpUtils.sortListing(fs, config, pathToListFile);
+
+    SequenceFile.Reader reader = new SequenceFile.Reader(
+                          config, SequenceFile.Reader.file(sortedList));
+    try {
+      Text lastKey = new Text("*"); //source relative path can never hold *
+      FileStatus lastFileStatus = new FileStatus();
+
+      Text currentKey = new Text();
+      while (reader.next(currentKey)) {
+        if (currentKey.equals(lastKey)) {
+          FileStatus currentFileStatus = new FileStatus();
+          reader.getCurrentValue(currentFileStatus);
+          throw new DuplicateFileException("File " + lastFileStatus.getPath() + " and " +
+              currentFileStatus.getPath() + " would cause duplicates. Aborting");
+        }
+        reader.getCurrentValue(lastFileStatus);
+        lastKey.set(currentKey);
+      }
+    } finally {
+      IOUtils.closeStream(reader);
+    }
+  }
+
+  /**
+   * Protected constructor, to initialize configuration.
+   * @param configuration The input configuration,
+   *                        with which the source/target FileSystems may be accessed.
+   * @param credentials - Credentials object on which the FS delegation tokens are cached. If null,
+   * delegation token caching is skipped
+   */
+  protected CopyListing(Configuration configuration, Credentials credentials) {
+    setConf(configuration);
+    setCredentials(credentials);
+  }
+
+  /**
+   * Set the Credentials store, on which FS delegation tokens will be cached
+   * @param credentials - Credentials object
+   */
+  protected void setCredentials(Credentials credentials) {
+    this.credentials = credentials;
+  }
+
+  /**
+   * Get the credentials used to update the delegation tokens for accessed FS objects
+   * @return Credentials object
+   */
+  protected Credentials getCredentials() {
+    return credentials;
+  }
+
+  /**
+   * Public Factory method with which the appropriate CopyListing implementation may be retrieved.
+   * @param configuration The input configuration.
+   * @param credentials Credentials object on which the FS delegation tokens are cached
+   * @param options The input Options, to help choose the appropriate CopyListing Implementation.
+   * @return An instance of the appropriate CopyListing implementation.
+   */
+  public static CopyListing getCopyListing(Configuration configuration,
+                                           Credentials credentials,
+                                           DistCpOptions options) {
+    if (options.getSourceFileListing() == null) {
+      return new GlobbedCopyListing(configuration, credentials);
+    } else {
+      return new FileBasedCopyListing(configuration, credentials);
+    }
+  }
+
+  static class DuplicateFileException extends RuntimeException {
+    public DuplicateFileException(String message) {
+      super(message);
+    }
+  }
+
+  static class InvalidInputException extends RuntimeException {
+    public InvalidInputException(String message) {
+      super(message);
+    }
+  }
+}
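As a rough illustration of how this contract is exercised, the sketch below obtains a listing through the factory method and builds it into a SequenceFile of <relative-path, FileStatus> records. The namenode address, source/target paths, listing location and class name are placeholders, not part of this patch.

    import java.util.Arrays;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.security.Credentials;
    import org.apache.hadoop.tools.CopyListing;
    import org.apache.hadoop.tools.DistCpOptions;

    public class CopyListingSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical source/target locations.
        DistCpOptions options = new DistCpOptions(
            Arrays.asList(new Path("hdfs://nn:8020/data/src")),
            new Path("hdfs://nn:8020/data/target"));
        // With no -f listing set, the factory returns a GlobbedCopyListing.
        CopyListing listing =
            CopyListing.getCopyListing(conf, new Credentials(), options);
        // Writes the listing, records the totals in conf, and fails on duplicate entries.
        listing.buildListing(new Path("hdfs://nn:8020/tmp/fileList.seq"), options);
      }
    }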

+ 405 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCp.java

@@ -0,0 +1,405 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobSubmissionFiles;
+import org.apache.hadoop.mapreduce.Cluster;
+import org.apache.hadoop.tools.CopyListing.*;
+import org.apache.hadoop.tools.mapred.CopyMapper;
+import org.apache.hadoop.tools.mapred.CopyOutputFormat;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import java.io.IOException;
+import java.util.Random;
+
+/**
+ * DistCp is the main driver-class for DistCpV2.
+ * For command-line use, DistCp::main() orchestrates the parsing of command-line
+ * parameters and the launch of the DistCp job.
+ * For programmatic use, a DistCp object can be constructed by specifying
+ * options (in a DistCpOptions object), and DistCp::execute() may be used to
+ * launch the copy-job. DistCp may alternatively be sub-classed to fine-tune
+ * behaviour.
+ */
+public class DistCp extends Configured implements Tool {
+  private static final Log LOG = LogFactory.getLog(DistCp.class);
+
+  private DistCpOptions inputOptions;
+  private Path metaFolder;
+
+  private static final String PREFIX = "_distcp";
+  private static final String WIP_PREFIX = "._WIP_";
+  private static final String DISTCP_DEFAULT_XML = "distcp-default.xml";
+  public static final Random rand = new Random();
+
+  private boolean submitted;
+  private FileSystem jobFS;
+
+  /**
+   * Public Constructor. Creates DistCp object with specified input-parameters.
+   * (E.g. source-paths, target-location, etc.)
+   * @param inputOptions Options (indicating source-paths, target-location.)
+   * @param configuration The Hadoop configuration against which the Copy-mapper must run.
+   * @throws Exception on failure.
+   */
+  public DistCp(Configuration configuration, DistCpOptions inputOptions) throws Exception {
+    Configuration config = new Configuration(configuration);
+    config.addResource(DISTCP_DEFAULT_XML);
+    setConf(config);
+    this.inputOptions = inputOptions;
+    this.metaFolder   = createMetaFolderPath();
+  }
+
+  /**
+   * To be used with the ToolRunner. Not for public consumption.
+   */
+  private DistCp() {}
+
+  /**
+   * Implementation of Tool::run(). Orchestrates the copy of source file(s)
+   * to target location, by:
+   *  1. Creating a list of files to be copied to target.
+   *  2. Launching a Map-only job to copy the files. (Delegates to execute().)
+   * @param argv List of arguments passed to DistCp, from the ToolRunner.
+   * @return On success, DistCpConstants.SUCCESS (0). Otherwise, a negative DistCpConstants error code.
+   */
+  public int run(String[] argv) {
+    try {
+      inputOptions = (OptionsParser.parse(argv));
+
+      LOG.info("Input Options: " + inputOptions);
+    } catch (Throwable e) {
+      LOG.error("Invalid arguments: ", e);
+      System.err.println("Invalid arguments: " + e.getMessage());
+      OptionsParser.usage();      
+      return DistCpConstants.INVALID_ARGUMENT;
+    }
+    
+    try {
+      execute();
+    } catch (InvalidInputException e) {
+      LOG.error("Invalid input: ", e);
+      return DistCpConstants.INVALID_ARGUMENT;
+    } catch (DuplicateFileException e) {
+      LOG.error("Duplicate files in input path: ", e);
+      return DistCpConstants.DUPLICATE_INPUT;
+    } catch (Exception e) {
+      LOG.error("Exception encountered ", e);
+      return DistCpConstants.UNKNOWN_ERROR;
+    }
+    return DistCpConstants.SUCCESS;
+  }
+
+  /**
+   * Implements the core-execution. Creates the file-list for copy,
+   * and launches the Hadoop-job, to do the copy.
+   * @return Job handle
+   * @throws Exception on failure.
+   */
+  public Job execute() throws Exception {
+    assert inputOptions != null;
+    assert getConf() != null;
+
+    Job job = null;
+    try {
+      metaFolder = createMetaFolderPath();
+      jobFS = metaFolder.getFileSystem(getConf());
+
+      job = createJob();
+      createInputFileListing(job);
+
+      job.submit();
+      submitted = true;
+    } finally {
+      if (!submitted) {
+        cleanup();
+      }
+    }
+
+    String jobID = job.getJobID().toString();
+    job.getConfiguration().set(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, jobID);
+    
+    LOG.info("DistCp job-id: " + jobID);
+    if (inputOptions.shouldBlock()) {
+      job.waitForCompletion(true);
+    }
+    return job;
+  }
+
+  /**
+   * Create the Job object to be submitted, with all required configuration
+   *
+   * @return Reference to job object.
+   * @throws IOException - Exception if any
+   */
+  private Job createJob() throws IOException {
+    String jobName = "distcp";
+    String userChosenName = getConf().get(JobContext.JOB_NAME);
+    if (userChosenName != null)
+      jobName += ": " + userChosenName;
+    Job job = Job.getInstance(getConf());
+    job.setJobName(jobName);
+    job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), inputOptions));
+    job.setJarByClass(CopyMapper.class);
+    configureOutputFormat(job);
+
+    job.setMapperClass(CopyMapper.class);
+    job.setNumReduceTasks(0);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(Text.class);
+    job.setOutputFormatClass(CopyOutputFormat.class);
+    job.getConfiguration().set(JobContext.MAP_SPECULATIVE, "false");
+    job.getConfiguration().set(JobContext.NUM_MAPS,
+                  String.valueOf(inputOptions.getMaxMaps()));
+
+    if (inputOptions.getSslConfigurationFile() != null) {
+      setupSSLConfig(job);
+    }
+
+    inputOptions.appendToConf(job.getConfiguration());
+    return job;
+  }
+
+  /**
+   * Set up SSL configuration on the job configuration to enable hsftp access
+   * from the map tasks. Also copies the SSL configuration file to the distributed cache
+   *
+   * @param job - Reference to job's handle
+   * @throws java.io.IOException - Exception if unable to locate ssl config file
+   */
+  private void setupSSLConfig(Job job) throws IOException  {
+    Configuration configuration = job.getConfiguration();
+    Path sslConfigPath = new Path(configuration.
+        getResource(inputOptions.getSslConfigurationFile()).toString());
+
+    addSSLFilesToDistCache(job, sslConfigPath);
+    configuration.set(DistCpConstants.CONF_LABEL_SSL_CONF, sslConfigPath.getName());
+    configuration.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfigPath.getName());
+  }
+
+  /**
+   * Add SSL files to distributed cache. Trust store, key store and ssl config xml
+   *
+   * @param job - Job handle
+   * @param sslConfigPath - ssl Configuration file specified through options
+   * @throws IOException - If any
+   */
+  private void addSSLFilesToDistCache(Job job,
+                                      Path sslConfigPath) throws IOException {
+    Configuration configuration = job.getConfiguration();
+    FileSystem localFS = FileSystem.getLocal(configuration);
+
+    Configuration sslConf = new Configuration(false);
+    sslConf.addResource(sslConfigPath);
+
+    Path localStorePath = getLocalStorePath(sslConf,
+                            DistCpConstants.CONF_LABEL_SSL_TRUST_STORE_LOCATION);
+    job.addCacheFile(localStorePath.makeQualified(localFS.getUri(),
+                                      localFS.getWorkingDirectory()).toUri());
+    configuration.set(DistCpConstants.CONF_LABEL_SSL_TRUST_STORE_LOCATION,
+                      localStorePath.getName());
+
+    localStorePath = getLocalStorePath(sslConf,
+                             DistCpConstants.CONF_LABEL_SSL_KEY_STORE_LOCATION);
+    job.addCacheFile(localStorePath.makeQualified(localFS.getUri(),
+                                      localFS.getWorkingDirectory()).toUri());
+    configuration.set(DistCpConstants.CONF_LABEL_SSL_KEY_STORE_LOCATION,
+                                      localStorePath.getName());
+
+    job.addCacheFile(sslConfigPath.makeQualified(localFS.getUri(),
+                                      localFS.getWorkingDirectory()).toUri());
+
+  }
+
+  /**
+   * Get Local Trust store/key store path
+   *
+   * @param sslConf - Config from SSL Client xml
+   * @param storeKey - Key for either trust store or key store
+   * @return - Path where the store is present
+   * @throws IOException - If any
+   */
+  private Path getLocalStorePath(Configuration sslConf, String storeKey) throws IOException {
+    if (sslConf.get(storeKey) != null) {
+      return new Path(sslConf.get(storeKey));
+    } else {
+      throw new IOException("Store for " + storeKey + " is not set in " +
+          inputOptions.getSslConfigurationFile());
+    }
+  }
+
+  /**
+   * Set up the output format appropriately
+   *
+   * @param job - Job handle
+   * @throws IOException - Exception if any
+   */
+  private void configureOutputFormat(Job job) throws IOException {
+    final Configuration configuration = job.getConfiguration();
+    Path targetPath = inputOptions.getTargetPath();
+    FileSystem targetFS = targetPath.getFileSystem(configuration);
+    targetPath = targetPath.makeQualified(targetFS.getUri(),
+                                          targetFS.getWorkingDirectory());
+
+    if (inputOptions.shouldAtomicCommit()) {
+      Path workDir = inputOptions.getAtomicWorkPath();
+      if (workDir == null) {
+        workDir = targetPath.getParent();
+      }
+      workDir = new Path(workDir, WIP_PREFIX + targetPath.getName()
+                                + rand.nextInt());
+      FileSystem workFS = workDir.getFileSystem(configuration);
+      if (!DistCpUtils.compareFs(targetFS, workFS)) {
+        throw new IllegalArgumentException("Work path " + workDir +
+            " and target path " + targetPath + " are in different file system");
+      }
+      CopyOutputFormat.setWorkingDirectory(job, workDir);
+    } else {
+      CopyOutputFormat.setWorkingDirectory(job, targetPath);
+    }
+    CopyOutputFormat.setCommitDirectory(job, targetPath);
+
+    Path logPath = inputOptions.getLogPath();
+    if (logPath == null) {
+      logPath = new Path(metaFolder, "_logs");
+    } else {
+      LOG.info("DistCp job log path: " + logPath);
+    }
+    CopyOutputFormat.setOutputPath(job, logPath);
+  }
+
+  /**
+   * Create input listing by invoking an appropriate copy listing
+   * implementation. Also add delegation tokens for each path
+   * to job's credential store
+   *
+   * @param job - Handle to job
+   * @return Returns the path where the copy listing is created
+   * @throws IOException - If any
+   */
+  private Path createInputFileListing(Job job) throws IOException {
+    Path fileListingPath = getFileListingPath();
+    CopyListing copyListing = CopyListing.getCopyListing(job.getConfiguration(),
+        job.getCredentials(), inputOptions);
+    copyListing.buildListing(fileListingPath, inputOptions);
+    return fileListingPath;
+  }
+
+  /**
+   * Get default name of the copy listing file. Use the meta folder
+   * to create the copy listing file
+   *
+   * @return - Path where the copy listing file has to be saved
+   * @throws IOException - Exception if any
+   */
+  private Path getFileListingPath() throws IOException {
+    String fileListPathStr = metaFolder + "/fileList.seq";
+    Path path = new Path(fileListPathStr);
+    return new Path(path.toUri().normalize().toString());
+  }
+
+  /**
+   * Create a default working folder for the job, under the
+   * job staging directory
+   *
+   * @return Returns the working folder information
+   * @throws Exception - Exception if any
+   */
+  private Path createMetaFolderPath() throws Exception {
+    Configuration configuration = getConf();
+    Path stagingDir = JobSubmissionFiles.getStagingDir(
+            new Cluster(configuration), configuration);
+    Path metaFolderPath = new Path(stagingDir, PREFIX + String.valueOf(rand.nextInt()));
+    if (LOG.isDebugEnabled())
+      LOG.debug("Meta folder location: " + metaFolderPath);
+    configuration.set(DistCpConstants.CONF_LABEL_META_FOLDER, metaFolderPath.toString());    
+    return metaFolderPath;
+  }
+
+  /**
+   * Main function of the DistCp program. Parses the input arguments (via OptionsParser),
+   * and invokes the DistCp::run() method, via the ToolRunner.
+   * @param argv Command-line arguments sent to DistCp.
+   */
+  public static void main(String argv[]) {
+    try {
+      DistCp distCp = new DistCp();
+      Cleanup CLEANUP = new Cleanup(distCp);
+
+      Runtime.getRuntime().addShutdownHook(CLEANUP);
+      System.exit(ToolRunner.run(getDefaultConf(), distCp, argv));
+    }
+    catch (Exception e) {
+      LOG.error("Couldn't complete DistCp operation: ", e);
+      System.exit(DistCpConstants.UNKNOWN_ERROR);
+    }
+  }
+
+  /**
+   * Loads properties from distcp-default.xml into configuration
+   * object
+   * @return Configuration which includes properties from distcp-default.xml
+   */
+  private static Configuration getDefaultConf() {
+    Configuration config = new Configuration();
+    config.addResource(DISTCP_DEFAULT_XML);
+    return config;
+  }
+
+  private synchronized void cleanup() {
+    try {
+      if (metaFolder == null) return;
+
+      jobFS.delete(metaFolder, true);
+      metaFolder = null;
+    } catch (IOException e) {
+      LOG.error("Unable to cleanup meta folder: " + metaFolder, e);
+    }
+  }
+
+  private boolean isSubmitted() {
+    return submitted;
+  }
+
+  private static class Cleanup extends Thread {
+    private final DistCp distCp;
+
+    public Cleanup(DistCp distCp) {
+      this.distCp = distCp;
+    }
+
+    @Override
+    public void run() {
+      if (distCp.isSubmitted()) return;
+
+      distCp.cleanup();
+    }
+  }
+}
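To ground the class-level javadoc above, here is a minimal, hedged sketch of programmatic use; the paths, namenode address and driver class name are invented for illustration only.

    import java.util.Arrays;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.tools.DistCp;
    import org.apache.hadoop.tools.DistCpOptions;

    public class DistCpDriverSketch {
      public static void main(String[] args) throws Exception {
        // Hypothetical source/target paths; any resolvable Hadoop paths would do.
        DistCpOptions options = new DistCpOptions(
            Arrays.asList(new Path("hdfs://nn:8020/data/in")),
            new Path("hdfs://nn:8020/data/out"));
        options.setSyncFolder(true);   // equivalent to -update
        options.setMaxMaps(10);        // equivalent to -m 10
        DistCp distCp = new DistCp(new Configuration(), options);
        Job job = distCp.execute();    // submits and, by default, blocks for completion
        System.out.println("DistCp succeeded: " + job.isSuccessful());
      }
    }

When shouldBlock() is left at its default (true), execute() waits for the job to finish; with -async on the command line, or setBlocking(false), it returns right after submission and the caller can poll the returned Job handle.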

+ 104 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java

@@ -0,0 +1,104 @@
+package org.apache.hadoop.tools;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Utility class to hold commonly used constants.
+ */
+public class DistCpConstants {
+
+  /* Default number of maps to use for DistCp */
+  public static final int DEFAULT_MAPS = 20;
+
+  /* Default bandwidth if none specified */
+  public static final int DEFAULT_BANDWIDTH_MB = 100;
+
+  /* Default strategy for copying. Implementation looked up
+     from distcp-default.xml
+   */
+  public static final String UNIFORMSIZE = "uniformsize";
+
+  /**
+   *  Constants mapping to command line switches/input options
+   */
+  public static final String CONF_LABEL_ATOMIC_COPY = "distcp.atomic.copy";
+  public static final String CONF_LABEL_WORK_PATH = "distcp.work.path";
+  public static final String CONF_LABEL_LOG_PATH = "distcp.log.path";
+  public static final String CONF_LABEL_IGNORE_FAILURES = "distcp.ignore.failures";
+  public static final String CONF_LABEL_PRESERVE_STATUS = "distcp.preserve.status";
+  public static final String CONF_LABEL_SYNC_FOLDERS = "distcp.sync.folders";
+  public static final String CONF_LABEL_DELETE_MISSING = "distcp.delete.missing.source";
+  public static final String CONF_LABEL_SSL_CONF = "distcp.keystore.resource";
+  public static final String CONF_LABEL_MAX_MAPS = "distcp.max.maps";
+  public static final String CONF_LABEL_SOURCE_LISTING = "distcp.source.listing";
+  public static final String CONF_LABEL_COPY_STRATEGY = "distcp.copy.strategy";
+  public static final String CONF_LABEL_SKIP_CRC = "distcp.skip.crc";
+  public static final String CONF_LABEL_OVERWRITE = "distcp.copy.overwrite";
+  public static final String CONF_LABEL_BANDWIDTH_MB = "distcp.map.bandwidth.mb";
+
+  /* Total bytes to be copied. Updated by copylisting. Unfiltered count */
+  public static final String CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED = "mapred.total.bytes.expected";
+
+  /* Total number of paths to copy, includes directories. Unfiltered count */
+  public static final String CONF_LABEL_TOTAL_NUMBER_OF_RECORDS = "mapred.number.of.records";
+
+  /* SSL keystore resource */
+  public static final String CONF_LABEL_SSL_KEYSTORE = "dfs.https.client.keystore.resource";
+
+  /* If input is specified via -f <<source listing>>, the file containing the src paths */
+  public static final String CONF_LABEL_LISTING_FILE_PATH = "distcp.listing.file.path";
+
+  /* Directory where the mapreduce job will write to. If not atomic commit, then same
+    as CONF_LABEL_TARGET_FINAL_PATH
+   */
+  public static final String CONF_LABEL_TARGET_WORK_PATH = "distcp.target.work.path";
+
+  /* Directory where the final data will be committed to. If not atomic commit, then same
+    as CONF_LABEL_TARGET_WORK_PATH
+   */
+  public static final String CONF_LABEL_TARGET_FINAL_PATH = "distcp.target.final.path";
+
+  /**
+   * DistCp job id for consumers of DistCp
+   */
+  public static final String CONF_LABEL_DISTCP_JOB_ID = "distcp.job.id";
+
+  /* Meta folder where the job's intermediate data is kept */
+  public static final String CONF_LABEL_META_FOLDER = "distcp.meta.folder";
+
+  /**
+   * Conf label for SSL Trust-store location.
+   */
+  public static final String CONF_LABEL_SSL_TRUST_STORE_LOCATION
+      = "ssl.client.truststore.location";
+
+  /**
+   * Conf label for SSL Key-store location.
+   */
+  public static final String CONF_LABEL_SSL_KEY_STORE_LOCATION
+      = "ssl.client.keystore.location";
+
+  /**
+   * Constants for DistCp return code to shell / consumer of ToolRunner's run
+   */
+  public static final int SUCCESS = 0;
+  public static final int INVALID_ARGUMENT = -1;
+  public static final int DUPLICATE_INPUT = -2;
+  public static final int UNKNOWN_ERROR = -999;
+}
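A small hedged sketch of how a downstream consumer might read some of these labels back from a job configuration. The values would normally be populated by DistCp and CopyListing at submission time; the class name and default values below are illustrative only.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.tools.DistCpConstants;

    public class DistCpConfSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Populated by DistCp.execute() and CopyListing.buildListing() on a real job;
        // the defaults here only keep this stand-alone sketch from printing nulls.
        String jobId = conf.get(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, "<not-submitted>");
        long bytes = conf.getLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, 0L);
        long paths = conf.getLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, 0L);
        System.out.println(jobId + ": " + paths + " paths, " + bytes + " bytes to copy");
      }
    }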

+ 218 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java

@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.cli.Option;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Enumeration mapping configuration keys to distcp command line
+ * options.
+ */
+public enum DistCpOptionSwitch {
+
+  /**
+   * Ignores any failures during copy, and continues with rest.
+   * Logs failures in a file
+   */
+  IGNORE_FAILURES(DistCpConstants.CONF_LABEL_IGNORE_FAILURES,
+      new Option("i", false, "Ignore failures during copy")),
+
+  /**
+   * Preserves status of file/path in the target.
+   * Default behavior with -p is to preserve replication,
+   * block size, user, group and permission on the target file.
+   *
+   * If any of the optional flags among r, b, u, g, p are specified, then
+   * only the corresponding file attributes are preserved.
+   *
+   */
+  PRESERVE_STATUS(DistCpConstants.CONF_LABEL_PRESERVE_STATUS,
+      new Option("p", true, "preserve status (rbugp)" +
+          "(replication, block-size, user, group, permission)")),
+
+  /**
+   * Update target location by copying only files that are missing
+   * in the target. This can be used to periodically sync two folders
+   * across source and target. Typically used with DELETE_MISSING
+   * Incompatible with ATOMIC_COMMIT
+   */
+  SYNC_FOLDERS(DistCpConstants.CONF_LABEL_SYNC_FOLDERS, 
+      new Option("update", false, "Update target, copying only missing" +
+          "files or directories")),
+
+  /**
+   * Deletes files in the target that are missing from the source.
+   * This allows the target to be in sync with the source contents
+   * Typically used in conjunction with SYNC_FOLDERS
+   * Incompatible with ATOMIC_COMMIT
+   */
+  DELETE_MISSING(DistCpConstants.CONF_LABEL_DELETE_MISSING,
+      new Option("delete", false, "Delete from target, " +
+          "files missing in source")),
+
+  /**
+   * Configuration file to use with hftps:// for securely copying
+   * files across clusters. Typically the configuration file contains
+   * truststore/keystore information such as location, password and type
+   */
+  SSL_CONF(DistCpConstants.CONF_LABEL_SSL_CONF,
+      new Option("mapredSslConf", true, "Configuration for ssl config file" +
+          ", to use with hftps://")),
+
+  /**
+   * Max number of maps to use during copy. DistCp will split work
+   * as equally as possible among these maps
+   */
+  MAX_MAPS(DistCpConstants.CONF_LABEL_MAX_MAPS, 
+      new Option("m", true, "Max number of concurrent maps to use for copy")),
+
+  /**
+   * Source file listing can be provided to DistCp in a file.
+   * This allows DistCp to copy an arbitrary list of files from
+   * the source to the target
+   */
+  SOURCE_FILE_LISTING(DistCpConstants.CONF_LABEL_SOURCE_LISTING,
+      new Option("f", true, "List of files that need to be copied")),
+
+  /**
+   * Copy all the source files and commit them atomically to the target.
+   * This is typically useful in cases where there is a process
+   * polling for availability of a file/dir. This option is incompatible
+   * with SYNC_FOLDERS & DELETE_MISSING
+   */
+  ATOMIC_COMMIT(DistCpConstants.CONF_LABEL_ATOMIC_COPY,
+      new Option("atomic", false, "Commit all changes or none")),
+
+  /**
+   * Work path to be used only in conjunction with atomic commit
+   */
+  WORK_PATH(DistCpConstants.CONF_LABEL_WORK_PATH,
+      new Option("tmp", true, "Intermediate work path to be used for atomic commit")),
+
+  /**
+   * Log path where distcp output logs are written to
+   */
+  LOG_PATH(DistCpConstants.CONF_LABEL_LOG_PATH,
+      new Option("log", true, "Folder on DFS where distcp execution logs are saved")),
+
+  /**
+   * Copy strategy to use. This could be dynamic, uniform size, etc.
+   * DistCp would use an appropriate input format based on this.
+   */
+  COPY_STRATEGY(DistCpConstants.CONF_LABEL_COPY_STRATEGY,
+      new Option("strategy", true, "Copy strategy to use. Default is " +
+          "dividing work based on file sizes")),
+
+  /**
+   * Skip CRC checks between source and target, when determining what
+   * files need to be copied.
+   */
+  SKIP_CRC(DistCpConstants.CONF_LABEL_SKIP_CRC,
+      new Option("skipcrccheck", false, "Whether to skip CRC checks between " +
+          "source and target paths.")),
+
+  /**
+   * Overwrite target-files unconditionally.
+   */
+  OVERWRITE(DistCpConstants.CONF_LABEL_OVERWRITE,
+      new Option("overwrite", false, "Choose to overwrite target files " +
+          "unconditionally, even if they exist.")),
+
+  /**
+   * Should DistCp execution be blocking
+   */
+  BLOCKING("",
+      new Option("async", false, "Should distcp execution be blocking")),
+
+  FILE_LIMIT("",
+      new Option("filelimit", true, "(Deprecated!) Limit number of files " +
+              "copied to <= n")),
+
+  SIZE_LIMIT("",
+      new Option("sizelimit", true, "(Deprecated!) Limit number of files " +
+              "copied to <= n bytes")),
+
+  /**
+   * Specify bandwidth per map in MB
+   */
+  BANDWIDTH(DistCpConstants.CONF_LABEL_BANDWIDTH_MB,
+      new Option("bandwidth", true, "Specify bandwidth per map in MB"));
+
+  private final String confLabel;
+  private final Option option;
+
+  DistCpOptionSwitch(String confLabel, Option option) {
+    this.confLabel = confLabel;
+    this.option = option;
+  }
+
+  /**
+   * Get Configuration label for the option
+   * @return configuration label name
+   */
+  public String getConfigLabel() {
+    return confLabel;
+  }
+
+  /**
+   * Get CLI Option corresponding to the distcp option
+   * @return option
+   */
+  public Option getOption() {
+    return option;
+  }
+
+  /**
+   * Get Switch symbol
+   * @return switch symbol string
+   */
+  public String getSwitch() {
+    return option.getOpt();
+  }
+
+  @Override
+  public String toString() {
+    return  super.name() + " {" +
+        "confLabel='" + confLabel + '\'' +
+        ", option=" + option + '}';
+  }
+
+  /**
+   * Helper function to add an option to hadoop configuration object
+   * @param conf - Configuration object to include the option
+   * @param option - Option to add
+   * @param value - Value
+   */
+  public static void addToConf(Configuration conf,
+                               DistCpOptionSwitch option,
+                               String value) {
+    conf.set(option.getConfigLabel(), value);
+  }
+
+  /**
+   * Helper function to set an option to hadoop configuration object
+   * @param conf - Configuration object to include the option
+   * @param option - Option to add
+   */
+  public static void addToConf(Configuration conf,
+                               DistCpOptionSwitch option) {
+    conf.set(option.getConfigLabel(), "true");
+  }
+}
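For illustration, a short sketch of the addToConf() helpers, mirroring what DistCpOptions.appendToConf() does before submission. The wrapper class name is hypothetical.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.tools.DistCpOptionSwitch;

    public class OptionSwitchSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Record the -update and -bandwidth switches under their configuration labels.
        DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SYNC_FOLDERS);
        DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BANDWIDTH, "50");
        System.out.println(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel() + " = "
            + conf.get(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel()));
        System.out.println(DistCpOptionSwitch.BANDWIDTH.getConfigLabel() + " = "
            + conf.get(DistCpOptionSwitch.BANDWIDTH.getConfigLabel()));
      }
    }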

+ 525 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java

@@ -0,0 +1,525 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.tools.util.DistCpUtils;
+
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+/**
+ * The Options class encapsulates all DistCp options.
+ * These may be set from command-line (via the OptionsParser)
+ * or may be set manually.
+ */
+public class DistCpOptions {
+
+  private boolean atomicCommit = false;
+  private boolean syncFolder = false;
+  private boolean deleteMissing = false;
+  private boolean ignoreFailures = false;
+  private boolean overwrite = false;
+  private boolean skipCRC = false;
+  private boolean blocking = true;
+
+  private int maxMaps = DistCpConstants.DEFAULT_MAPS;
+  private int mapBandwidth = DistCpConstants.DEFAULT_BANDWIDTH_MB;
+
+  private String sslConfigurationFile;
+
+  private String copyStrategy = DistCpConstants.UNIFORMSIZE;
+
+  private EnumSet<FileAttribute> preserveStatus = EnumSet.noneOf(FileAttribute.class);
+
+  private Path atomicWorkPath;
+
+  private Path logPath;
+
+  private Path sourceFileListing;
+  private List<Path> sourcePaths;
+
+  private Path targetPath;
+
+  public static enum FileAttribute{
+    REPLICATION, BLOCKSIZE, USER, GROUP, PERMISSION;
+
+    public static FileAttribute getAttribute(char symbol) {
+      for (FileAttribute attribute : values()) {
+        if (attribute.name().charAt(0) == Character.toUpperCase(symbol)) {
+          return attribute;
+        }
+      }
+      throw new NoSuchElementException("No attribute for " + symbol);
+    }
+  }
+
+  /**
+   * Constructor, to initialize source/target paths.
+   * @param sourcePaths List of source-paths (including wildcards)
+   *                     to be copied to target.
+   * @param targetPath Destination path for the dist-copy.
+   */
+  public DistCpOptions(List<Path> sourcePaths, Path targetPath) {
+    assert sourcePaths != null && !sourcePaths.isEmpty() : "Invalid source paths";
+    assert targetPath != null : "Invalid Target path";
+
+    this.sourcePaths = sourcePaths;
+    this.targetPath = targetPath;
+  }
+
+  /**
+   * Constructor, to initialize source/target paths.
+   * @param sourceFileListing File containing list of source paths
+   * @param targetPath Destination path for the dist-copy.
+   */
+  public DistCpOptions(Path sourceFileListing, Path targetPath) {
+    assert sourceFileListing != null : "Invalid source paths";
+    assert targetPath != null : "Invalid Target path";
+
+    this.sourceFileListing = sourceFileListing;
+    this.targetPath = targetPath;
+  }
+
+  /**
+   * Copy constructor.
+   * @param that DistCpOptions being copied from.
+   */
+  public DistCpOptions(DistCpOptions that) {
+    if (this != that && that != null) {
+      this.atomicCommit = that.atomicCommit;
+      this.syncFolder = that.syncFolder;
+      this.deleteMissing = that.deleteMissing;
+      this.ignoreFailures = that.ignoreFailures;
+      this.overwrite = that.overwrite;
+      this.skipCRC = that.skipCRC;
+      this.blocking = that.blocking;
+      this.maxMaps = that.maxMaps;
+      this.mapBandwidth = that.mapBandwidth;
+      this.sslConfigurationFile = that.getSslConfigurationFile();
+      this.copyStrategy = that.copyStrategy;
+      this.preserveStatus = that.preserveStatus;
+      this.atomicWorkPath = that.getAtomicWorkPath();
+      this.logPath = that.getLogPath();
+      this.sourceFileListing = that.getSourceFileListing();
+      this.sourcePaths = that.getSourcePaths();
+      this.targetPath = that.getTargetPath();
+    }
+  }
+
+  /**
+   * Should the data be committed atomically?
+   *
+   * @return true if data should be committed atomically. false otherwise
+   */
+  public boolean shouldAtomicCommit() {
+    return atomicCommit;
+  }
+
+  /**
+   * Set if data needs to be committed atomically
+   *
+   * @param atomicCommit - boolean switch
+   */
+  public void setAtomicCommit(boolean atomicCommit) {
+    validate(DistCpOptionSwitch.ATOMIC_COMMIT, atomicCommit);
+    this.atomicCommit = atomicCommit;
+  }
+
+  /**
+   * Should the data be sync'ed between source and target paths?
+   *
+   * @return true if data should be sync'ed up. false otherwise
+   */
+  public boolean shouldSyncFolder() {
+    return syncFolder;
+  }
+
+  /**
+   * Set if source and target folder contents should be sync'ed up
+   *
+   * @param syncFolder - boolean switch
+   */
+  public void setSyncFolder(boolean syncFolder) {
+    validate(DistCpOptionSwitch.SYNC_FOLDERS, syncFolder);
+    this.syncFolder = syncFolder;
+  }
+
+  /**
+   * Should target files missing in the source be deleted?
+   *
+   * @return true if zombie target files are to be removed. false otherwise
+   */
+  public boolean shouldDeleteMissing() {
+    return deleteMissing;
+  }
+
+  /**
+   * Set if files only present in target should be deleted
+   *
+   * @param deleteMissing - boolean switch
+   */
+  public void setDeleteMissing(boolean deleteMissing) {
+    validate(DistCpOptionSwitch.DELETE_MISSING, deleteMissing);
+    this.deleteMissing = deleteMissing;
+  }
+
+  /**
+   * Should failures be logged and ignored during copy?
+   *
+   * @return true if failures are to be logged and ignored. false otherwise
+   */
+  public boolean shouldIgnoreFailures() {
+    return ignoreFailures;
+  }
+
+  /**
+   * Set if failures during copy should be ignored
+   *
+   * @param ignoreFailures - boolean switch
+   */
+  public void setIgnoreFailures(boolean ignoreFailures) {
+    this.ignoreFailures = ignoreFailures;
+  }
+
+  /**
+   * Should DistCp be running in blocking mode
+   *
+   * @return true if it should run in blocking mode, false otherwise
+   */
+  public boolean shouldBlock() {
+    return blocking;
+  }
+
+  /**
+   * Set if DistCp should run in blocking or non-blocking mode
+   *
+   * @param blocking - boolean switch
+   */
+  public void setBlocking(boolean blocking) {
+    this.blocking = blocking;
+  }
+
+  /**
+   * Should files be overwritten always?
+   *
+   * @return true if files that already exist in the target should always
+   *         be overwritten. false otherwise
+   */
+  public boolean shouldOverwrite() {
+    return overwrite;
+  }
+
+  /**
+   * Set if files should always be overwritten on target
+   *
+   * @param overwrite - boolean switch
+   */
+  public void setOverwrite(boolean overwrite) {
+    validate(DistCpOptionSwitch.OVERWRITE, overwrite);
+    this.overwrite = overwrite;
+  }
+
+  /**
+   * Should the CRC/checksum check be skipped while determining whether files are identical?
+   *
+   * @return true if the checksum check should be skipped while determining whether
+   *         files are identical. false otherwise
+   */
+  public boolean shouldSkipCRC() {
+    return skipCRC;
+  }
+
+  /**
+   * Set if checksum comparison should be skipped while determining if
+   * source and destination files are identical
+   *
+   * @param skipCRC - boolean switch
+   */
+  public void setSkipCRC(boolean skipCRC) {
+    validate(DistCpOptionSwitch.SKIP_CRC, skipCRC);
+    this.skipCRC = skipCRC;
+  }
+
+  /** Get the max number of maps to use for this copy
+   *
+   * @return Max number of maps
+   */
+  public int getMaxMaps() {
+    return maxMaps;
+  }
+
+  /**
+   * Set the max number of maps to use for copy
+   *
+   * @param maxMaps - Number of maps
+   */
+  public void setMaxMaps(int maxMaps) {
+    this.maxMaps = maxMaps;
+  }
+
+  /** Get the map bandwidth in MB
+   *
+   * @return Bandwidth in MB
+   */
+  public int getMapBandwidth() {
+    return mapBandwidth;
+  }
+
+  /**
+   * Set per map bandwidth
+   *
+   * @param mapBandwidth - per map bandwidth
+   */
+  public void setMapBandwidth(int mapBandwidth) {
+    assert mapBandwidth > 0 : "Bandwidth " + mapBandwidth + " is invalid (should be > 0)";
+    this.mapBandwidth = mapBandwidth;
+  }
+
+  /**
+   * Get path where the ssl configuration file is present to use for hftps://
+   *
+   * @return Path on local file system
+   */
+  public String getSslConfigurationFile() {
+    return sslConfigurationFile;
+  }
+
+  /**
+   * Set the SSL configuration file path to use with hftps:// (local path)
+   *
+   * @param sslConfigurationFile - Local ssl config file path
+   */
+  public void setSslConfigurationFile(String sslConfigurationFile) {
+    this.sslConfigurationFile = sslConfigurationFile;
+  }
+
+  /**
+   * Returns an iterator with the list of file attributes to preserve
+   *
+   * @return iterator of file attributes to preserve
+   */
+  public Iterator<FileAttribute> preserveAttributes() {
+    return preserveStatus.iterator();
+  }
+
+  /**
+   * Checks if the input attribute should be preserved or not
+   *
+   * @param attribute - Attribute to check
+   * @return True if attribute should be preserved, false otherwise
+   */
+  public boolean shouldPreserve(FileAttribute attribute) {
+    return preserveStatus.contains(attribute);
+  }
+
+  /**
+   * Add file attributes that need to be preserved. This method may be
+   * called multiple times to add attributes.
+   *
+   * @param fileAttribute - Attribute to add, one at a time
+   */
+  public void preserve(FileAttribute fileAttribute) {
+    for (FileAttribute attribute : preserveStatus) {
+      if (attribute.equals(fileAttribute)) {
+        return;
+      }
+    }
+    preserveStatus.add(fileAttribute);
+  }
+
+  /** Get work path for atomic commit. If null, the work
+   * path would be parentOf(targetPath) + "/._WIP_" + nameOf(targetPath)
+   *
+   * @return Atomic work path on the target cluster. Null if not set
+   */
+  public Path getAtomicWorkPath() {
+    return atomicWorkPath;
+  }
+
+  /**
+   * Set the work path for atomic commit
+   *
+   * @param atomicWorkPath - Path on the target cluster
+   */
+  public void setAtomicWorkPath(Path atomicWorkPath) {
+    this.atomicWorkPath = atomicWorkPath;
+  }
+
+  /** Get the output directory for writing distcp logs. If not set, logs
+   * are temporarily written to JobStagingDir/_logs and deleted
+   * upon job completion
+   *
+   * @return Log output path on the cluster where distcp job is run
+   */
+  public Path getLogPath() {
+    return logPath;
+  }
+
+  /**
+   * Set the log path where distcp output logs are stored
+   * Uses JobStagingDir/_logs by default
+   *
+   * @param logPath - Path where logs will be saved
+   */
+  public void setLogPath(Path logPath) {
+    this.logPath = logPath;
+  }
+
+  /**
+   * Get the copy strategy to use. Uses appropriate input format
+   *
+   * @return copy strategy to use
+   */
+  public String getCopyStrategy() {
+    return copyStrategy;
+  }
+
+  /**
+   * Set the copy strategy to use. Should map to a strategy implementation
+   * in distcp-default.xml
+   *
+   * @param copyStrategy - copy Strategy to use
+   */
+  public void setCopyStrategy(String copyStrategy) {
+    this.copyStrategy = copyStrategy;
+  }
+
+  /**
+   * File path (hdfs:// or file://) that contains the list of actual
+   * files to copy
+   *
+   * @return - Source listing file path
+   */
+  public Path getSourceFileListing() {
+    return sourceFileListing;
+  }
+
+  /**
+   * Getter for sourcePaths.
+   * @return List of source-paths.
+   */
+  public List<Path> getSourcePaths() {
+    return sourcePaths;
+  }
+
+  /**
+   * Setter for sourcePaths.
+   * @param sourcePaths The new list of source-paths.
+   */
+  public void setSourcePaths(List<Path> sourcePaths) {
+    assert sourcePaths != null && sourcePaths.size() != 0;
+    this.sourcePaths = sourcePaths;
+  }
+
+  /**
+   * Getter for the targetPath.
+   * @return The target-path.
+   */
+  public Path getTargetPath() {
+    return targetPath;
+  }
+
+  public void validate(DistCpOptionSwitch option, boolean value) {
+
+    boolean syncFolder = (option == DistCpOptionSwitch.SYNC_FOLDERS ?
+        value : this.syncFolder);
+    boolean overwrite = (option == DistCpOptionSwitch.OVERWRITE ?
+        value : this.overwrite);
+    boolean deleteMissing = (option == DistCpOptionSwitch.DELETE_MISSING ?
+        value : this.deleteMissing);
+    boolean atomicCommit = (option == DistCpOptionSwitch.ATOMIC_COMMIT ?
+        value : this.atomicCommit);
+    boolean skipCRC = (option == DistCpOptionSwitch.SKIP_CRC ?
+        value : this.skipCRC);
+
+    if (syncFolder && atomicCommit) {
+      throw new IllegalArgumentException("Atomic commit can't be used with " +
+          "sync folder or overwrite options");
+    }
+
+    if (deleteMissing && !(overwrite || syncFolder)) {
+      throw new IllegalArgumentException("Delete missing is applicable " +
+          "only with update or overwrite options");
+    }
+
+    if (overwrite && syncFolder) {
+      throw new IllegalArgumentException("Overwrite and update options are " +
+          "mutually exclusive");
+    }
+
+    if (!syncFolder && skipCRC) {
+      throw new IllegalArgumentException("Skip CRC is valid only with update options");
+    }
+
+  }
+
+  /**
+   * Add options to configuration. These will be used in the Mapper/committer
+   *
+   * @param conf - Configuration object to which the options need to be added
+   */
+  public void appendToConf(Configuration conf) {
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.ATOMIC_COMMIT,
+        String.valueOf(atomicCommit));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.IGNORE_FAILURES,
+        String.valueOf(ignoreFailures));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SYNC_FOLDERS,
+        String.valueOf(syncFolder));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.DELETE_MISSING,
+        String.valueOf(deleteMissing));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.OVERWRITE,
+        String.valueOf(overwrite));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SKIP_CRC,
+        String.valueOf(skipCRC));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BANDWIDTH,
+        String.valueOf(mapBandwidth));
+    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.PRESERVE_STATUS,
+        DistCpUtils.packAttributes(preserveStatus));
+  }
+
+  /**
+   * Utility to easily string-ify Options, for logging.
+   *
+   * @return String representation of the Options.
+   */
+  @Override
+  public String toString() {
+    return "DistCpOptions{" +
+        "atomicCommit=" + atomicCommit +
+        ", syncFolder=" + syncFolder +
+        ", deleteMissing=" + deleteMissing +
+        ", ignoreFailures=" + ignoreFailures +
+        ", maxMaps=" + maxMaps +
+        ", sslConfigurationFile='" + sslConfigurationFile + '\'' +
+        ", copyStrategy='" + copyStrategy + '\'' +
+        ", sourceFileListing=" + sourceFileListing +
+        ", sourcePaths=" + sourcePaths +
+        ", targetPath=" + targetPath +
+        '}';
+  }
+
+  @Override
+  protected DistCpOptions clone() throws CloneNotSupportedException {
+    return (DistCpOptions) super.clone();
+  }
+}
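A brief sketch of building options programmatically and of how validate() rejects conflicting switches; the paths and the class name are placeholders.

    import java.util.Arrays;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.tools.DistCpOptions;
    import org.apache.hadoop.tools.DistCpOptions.FileAttribute;

    public class DistCpOptionsSketch {
      public static void main(String[] args) {
        DistCpOptions options = new DistCpOptions(
            Arrays.asList(new Path("hdfs://nn:8020/a")),  // hypothetical source
            new Path("hdfs://nn:8020/b"));                // hypothetical target
        options.setSyncFolder(true);
        options.setDeleteMissing(true);      // legal only with -update or -overwrite
        options.preserve(FileAttribute.REPLICATION);
        options.preserve(FileAttribute.PERMISSION);
        System.out.println(options);

        try {
          options.setAtomicCommit(true);     // conflicts with -update; validate() rejects it
        } catch (IllegalArgumentException expected) {
          System.out.println("Rejected: " + expected.getMessage());
        }
      }
    }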

+ 100 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/FileBasedCopyListing.java

@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.security.Credentials;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * FileBasedCopyListing implements the CopyListing interface,
+ * to create the copy-listing for DistCp,
+ * by iterating over all source paths mentioned in a specified input-file.
+ */
+public class FileBasedCopyListing extends CopyListing {
+
+  private final CopyListing globbedListing;
+  /**
+   * Constructor, to initialize base-class.
+   * @param configuration The input Configuration object.
+   * @param credentials - Credentials object on which the FS delegation tokens are cached. If null
+   * delegation token caching is skipped
+   */
+  public FileBasedCopyListing(Configuration configuration, Credentials credentials) {
+    super(configuration, credentials);
+    globbedListing = new GlobbedCopyListing(getConf(), credentials);
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected void validatePaths(DistCpOptions options)
+      throws IOException, InvalidInputException {
+  }
+
+  /**
+   * Implementation of CopyListing::buildListing().
+   *   Iterates over all source paths mentioned in the input-file.
+   * @param pathToListFile Path on HDFS where the listing file is written.
+   * @param options Input Options for DistCp (indicating source/target paths.)
+   * @throws IOException
+   */
+  @Override
+  public void doBuildListing(Path pathToListFile, DistCpOptions options) throws IOException {
+    DistCpOptions newOption = new DistCpOptions(options);
+    newOption.setSourcePaths(fetchFileList(options.getSourceFileListing()));
+    globbedListing.buildListing(pathToListFile, newOption);
+  }
+
+  private List<Path> fetchFileList(Path sourceListing) throws IOException {
+    List<Path> result = new ArrayList<Path>();
+    FileSystem fs = sourceListing.getFileSystem(getConf());
+    BufferedReader input = null;
+    try {
+      input = new BufferedReader(new InputStreamReader(fs.open(sourceListing)));
+      String line = input.readLine();
+      while (line != null) {
+        result.add(new Path(line));
+        line = input.readLine();
+      }
+    } finally {
+      IOUtils.closeStream(input);
+    }
+    return result;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getBytesToCopy() {
+    return globbedListing.getBytesToCopy();
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getNumberOfPaths() {
+    return globbedListing.getNumberOfPaths();
+  }
+}
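To show the expected shape of the -f listing consumed above (one source path per line, typically fully qualified), here is a hedged sketch that writes such a file and feeds it through the CopyListing factory; all paths and the class name are placeholders.

    import java.io.OutputStreamWriter;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.security.Credentials;
    import org.apache.hadoop.tools.CopyListing;
    import org.apache.hadoop.tools.DistCpOptions;

    public class FileListingSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path srcListing = new Path("hdfs://nn:8020/tmp/srcfiles.txt");  // hypothetical
        FileSystem fs = srcListing.getFileSystem(conf);

        // The -f listing is plain text: one source path per line.
        OutputStreamWriter out = new OutputStreamWriter(fs.create(srcListing));
        out.write("hdfs://nn:8020/data/2012/01/part-0000\n");
        out.write("hdfs://nn:8020/data/2012/01/part-0001\n");
        out.close();

        // With a source file listing set, the factory returns a FileBasedCopyListing.
        DistCpOptions options =
            new DistCpOptions(srcListing, new Path("hdfs://nn:8020/backup"));
        CopyListing listing = CopyListing.getCopyListing(conf, new Credentials(), options);
        listing.buildListing(new Path("hdfs://nn:8020/tmp/fileList.seq"), options);
      }
    }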

+ 105 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java

@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.security.Credentials;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * GlobbedCopyListing implements the CopyListing interface, to create the copy
+ * listing-file by "globbing" all specified source paths (wild-cards and all.)
+ */
+public class GlobbedCopyListing extends CopyListing {
+  private static final Log LOG = LogFactory.getLog(GlobbedCopyListing.class);
+
+  private final CopyListing simpleListing;
+  /**
+   * Constructor, to initialize the configuration.
+   * @param configuration The input Configuration object.
+   * @param credentials Credentials object on which the FS delegation tokens are cached. If null
+   * delegation token caching is skipped
+   */
+  public GlobbedCopyListing(Configuration configuration, Credentials credentials) {
+    super(configuration, credentials);
+    simpleListing = new SimpleCopyListing(getConf(), credentials);
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected void validatePaths(DistCpOptions options)
+      throws IOException, InvalidInputException {
+  }
+
+  /**
+   * Implementation of CopyListing::buildListing().
+   * Creates the copy listing by "globbing" all source-paths.
+   * @param pathToListingFile The location at which the copy-listing file
+   *                           is to be created.
+   * @param options Input Options for DistCp (indicating source/target paths.)
+   * @throws IOException
+   */
+  @Override
+  public void doBuildListing(Path pathToListingFile,
+                             DistCpOptions options) throws IOException {
+
+    List<Path> globbedPaths = new ArrayList<Path>();
+    if (options.getSourcePaths().isEmpty()) {
+      throw new InvalidInputException("Nothing to process. Source paths::EMPTY");  
+    }
+
+    for (Path p : options.getSourcePaths()) {
+      FileSystem fs = p.getFileSystem(getConf());
+      FileStatus[] inputs = fs.globStatus(p);
+
+      if(inputs != null && inputs.length > 0) {
+        for (FileStatus onePath: inputs) {
+          globbedPaths.add(onePath.getPath());
+        }
+      } else {
+        throw new InvalidInputException(p + " doesn't exist");        
+      }
+    }
+
+    DistCpOptions optionsGlobbed = new DistCpOptions(options);
+    optionsGlobbed.setSourcePaths(globbedPaths);
+    simpleListing.buildListing(pathToListingFile, optionsGlobbed);
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getBytesToCopy() {
+    return simpleListing.getBytesToCopy();
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getNumberOfPaths() {
+    return simpleListing.getNumberOfPaths();
+  }
+
+}

+ 246 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java

@@ -0,0 +1,246 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.cli.*;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
+
+import java.util.*;
+
+/**
+ * The OptionsParser parses out the command-line options passed to DistCp,
+ * and interprets those specific to DistCp, to create an Options object.
+ */
+public class OptionsParser {
+
+  private static final Log LOG = LogFactory.getLog(OptionsParser.class);
+
+  private static final Options cliOptions = new Options();      
+
+  static {
+    for (DistCpOptionSwitch option : DistCpOptionSwitch.values()) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Adding option " + option.getOption());
+      }
+      cliOptions.addOption(option.getOption());
+    }
+  }
+
+  private static class CustomParser extends GnuParser {
+    @Override
+    protected String[] flatten(Options options, String[] arguments, boolean stopAtNonOption) {
+      for (int index = 0; index < arguments.length; index++) {
+        if (arguments[index].equals("-" + DistCpOptionSwitch.PRESERVE_STATUS.getSwitch())) {
+          arguments[index] = "-prbugp";
+        }
+      }
+      return super.flatten(options, arguments, stopAtNonOption);
+    }
+  }
+
+  /**
+   * The parse method parses the command-line options, and creates
+   * a corresponding Options object.
+   * @param args Command-line arguments (excluding the options consumed
+   *              by the GenericOptionsParser).
+   * @return The Options object, corresponding to the specified command-line.
+   * @throws IllegalArgumentException: Thrown if the parse fails.
+   */
+  public static DistCpOptions parse(String args[]) throws IllegalArgumentException {
+
+    CommandLineParser parser = new CustomParser();
+
+    CommandLine command;
+    try {
+      command = parser.parse(cliOptions, args, true);
+    } catch (ParseException e) {
+      throw new IllegalArgumentException("Unable to parse arguments. " +
+        Arrays.toString(args), e);
+    }
+
+    DistCpOptions option;
+    Path targetPath;
+    List<Path> sourcePaths = new ArrayList<Path>();
+
+    String leftOverArgs[] = command.getArgs();
+    if (leftOverArgs == null || leftOverArgs.length < 1) {
+      throw new IllegalArgumentException("Target path not specified");
+    }
+
+    //Last Argument is the target path
+    targetPath = new Path(leftOverArgs[leftOverArgs.length -1].trim());
+
+    //Copy any source paths in the arguments to the list
+    for (int index = 0; index < leftOverArgs.length - 1; index++) {
+      sourcePaths.add(new Path(leftOverArgs[index].trim()));
+    }
+
+    /* If the command has a source file listing, use it; else, fall back on the source paths in args.
+       If both are present, throw an exception and bail */
+    if (command.hasOption(DistCpOptionSwitch.SOURCE_FILE_LISTING.getSwitch())) {
+      if (!sourcePaths.isEmpty()) {
+        throw new IllegalArgumentException("Both source file listing and source paths present");
+      }
+      option = new DistCpOptions(new Path(getVal(command, DistCpOptionSwitch.
+              SOURCE_FILE_LISTING.getSwitch())), targetPath);
+    } else {
+      if (sourcePaths.isEmpty()) {
+        throw new IllegalArgumentException("Neither source file listing nor source paths present");
+      }
+      option = new DistCpOptions(sourcePaths, targetPath);
+    }
+
+    //Process all the other option switches and set options appropriately
+    if (command.hasOption(DistCpOptionSwitch.IGNORE_FAILURES.getSwitch())) {
+      option.setIgnoreFailures(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.ATOMIC_COMMIT.getSwitch())) {
+      option.setAtomicCommit(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.WORK_PATH.getSwitch()) &&
+        option.shouldAtomicCommit()) {
+      String workPath = getVal(command, DistCpOptionSwitch.WORK_PATH.getSwitch());
+      if (workPath != null && !workPath.isEmpty()) {
+        option.setAtomicWorkPath(new Path(workPath));
+      }
+    } else if (command.hasOption(DistCpOptionSwitch.WORK_PATH.getSwitch())) {
+      throw new IllegalArgumentException("-tmp work-path can only be specified along with -atomic");      
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.LOG_PATH.getSwitch())) {
+      option.setLogPath(new Path(getVal(command, DistCpOptionSwitch.LOG_PATH.getSwitch())));
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.SYNC_FOLDERS.getSwitch())) {
+      option.setSyncFolder(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.OVERWRITE.getSwitch())) {
+      option.setOverwrite(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.DELETE_MISSING.getSwitch())) {
+      option.setDeleteMissing(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.SKIP_CRC.getSwitch())) {
+      option.setSkipCRC(true);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.BLOCKING.getSwitch())) {
+      option.setBlocking(false);
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.BANDWIDTH.getSwitch())) {
+      try {
+        Integer mapBandwidth = Integer.parseInt(
+            getVal(command, DistCpOptionSwitch.BANDWIDTH.getSwitch()).trim());
+        option.setMapBandwidth(mapBandwidth);
+      } catch (NumberFormatException e) {
+        throw new IllegalArgumentException("Bandwidth specified is invalid: " +
+            getVal(command, DistCpOptionSwitch.BANDWIDTH.getSwitch()), e);
+      }
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.SSL_CONF.getSwitch())) {
+      option.setSslConfigurationFile(command.
+          getOptionValue(DistCpOptionSwitch.SSL_CONF.getSwitch()));
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.MAX_MAPS.getSwitch())) {
+      try {
+        Integer maps = Integer.parseInt(
+            getVal(command, DistCpOptionSwitch.MAX_MAPS.getSwitch()).trim());
+        option.setMaxMaps(maps);
+      } catch (NumberFormatException e) {
+        throw new IllegalArgumentException("Number of maps is invalid: " +
+            getVal(command, DistCpOptionSwitch.MAX_MAPS.getSwitch()), e);
+      }
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.COPY_STRATEGY.getSwitch())) {
+      option.setCopyStrategy(
+            getVal(command, DistCpOptionSwitch.COPY_STRATEGY.getSwitch()));
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.PRESERVE_STATUS.getSwitch())) {
+      String attributes =
+          getVal(command, DistCpOptionSwitch.PRESERVE_STATUS.getSwitch());
+      if (attributes == null || attributes.isEmpty()) {
+        for (FileAttribute attribute : FileAttribute.values()) {
+          option.preserve(attribute);
+        }
+      } else {
+        for (int index = 0; index < attributes.length(); index++) {
+          option.preserve(FileAttribute.
+              getAttribute(attributes.charAt(index)));
+        }
+      }
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.FILE_LIMIT.getSwitch())) {
+      String fileLimitString = getVal(command,
+                              DistCpOptionSwitch.FILE_LIMIT.getSwitch());
+      try {
+        Integer.parseInt(fileLimitString);
+      }
+      catch (NumberFormatException e) {
+        throw new IllegalArgumentException("File-limit is invalid: "
+                                            + fileLimitString, e);
+      }
+      LOG.warn(DistCpOptionSwitch.FILE_LIMIT.getSwitch() + " is a deprecated" +
+              " option. Ignoring.");
+    }
+
+    if (command.hasOption(DistCpOptionSwitch.SIZE_LIMIT.getSwitch())) {
+      String sizeLimitString = getVal(command,
+                              DistCpOptionSwitch.SIZE_LIMIT.getSwitch());
+      try {
+        Long.parseLong(sizeLimitString);
+      }
+      catch (NumberFormatException e) {
+        throw new IllegalArgumentException("Size-limit is invalid: "
+                                            + sizeLimitString, e);
+      }
+      LOG.warn(DistCpOptionSwitch.SIZE_LIMIT.getSwitch() + " is a deprecated" +
+              " option. Ignoring.");
+    }
+
+    return option;
+  }
+
+  private static String getVal(CommandLine command, String swtch) {
+    String optionValue = command.getOptionValue(swtch);
+    if (optionValue == null) {
+      return null;
+    } else {
+      return optionValue.trim();
+    }
+  }
+
+  public static void usage() {
+    HelpFormatter formatter = new HelpFormatter();
+    formatter.printHelp("distcp OPTIONS [source_path...] <target_path>\n\nOPTIONS", cliOptions);
+  }
+}

+ 275 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java

@@ -0,0 +1,275 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.mapreduce.security.TokenCache;
+import org.apache.hadoop.security.Credentials;
+
+import java.io.*;
+import java.util.Stack;
+
+/**
+ * The SimpleCopyListing is responsible for making the exhaustive list of
+ * all files/directories under its specified list of input-paths.
+ * These are written into the specified copy-listing file.
+ * Note: The SimpleCopyListing doesn't handle wild-cards in the input-paths.
+ */
+public class SimpleCopyListing extends CopyListing {
+  private static final Log LOG = LogFactory.getLog(SimpleCopyListing.class);
+
+  private long totalPaths = 0;
+  private long totalBytesToCopy = 0;
+
+  /**
+   * Protected constructor, to initialize configuration.
+   *
+   * @param configuration The input configuration, with which the source/target FileSystems may be accessed.
+   * @param credentials Credentials object on which the FS delegation tokens are cached. If null,
+   *                    delegation-token caching is skipped.
+   */
+  protected SimpleCopyListing(Configuration configuration, Credentials credentials) {
+    super(configuration, credentials);
+  }
+
+  @Override
+  protected void validatePaths(DistCpOptions options)
+      throws IOException, InvalidInputException {
+
+    Path targetPath = options.getTargetPath();
+    FileSystem targetFS = targetPath.getFileSystem(getConf());
+    boolean targetIsFile = targetFS.isFile(targetPath);
+
+    //If target is a file, then source has to be single file
+    if (targetIsFile) {
+      if (options.getSourcePaths().size() > 1) {
+        throw new InvalidInputException("Multiple source being copied to a file: " +
+            targetPath);
+      }
+
+      Path srcPath = options.getSourcePaths().get(0);
+      FileSystem sourceFS = srcPath.getFileSystem(getConf());
+      if (!sourceFS.isFile(srcPath)) {
+        throw new InvalidInputException("Cannot copy " + srcPath +
+            ", which is not a file to " + targetPath);
+      }
+    }
+
+    if (options.shouldAtomicCommit() && targetFS.exists(targetPath)) {
+      throw new InvalidInputException("Target path for atomic-commit already exists: " +
+        targetPath + ". Cannot atomic-commit to pre-existing target-path.");
+    }
+
+    for (Path path: options.getSourcePaths()) {
+      FileSystem fs = path.getFileSystem(getConf());
+      if (!fs.exists(path)) {
+        throw new InvalidInputException(path + " doesn't exist");
+      }
+    }
+
+    /* This is required to allow map tasks to access each of the source
+       clusters. This retrieves the delegation token for each unique
+       file system and adds it to the job's private credential store.
+     */
+    Credentials credentials = getCredentials();
+    if (credentials != null) {
+      Path[] inputPaths = options.getSourcePaths().toArray(new Path[1]);
+      TokenCache.obtainTokensForNamenodes(credentials, inputPaths, getConf());
+    }
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void doBuildListing(Path pathToListingFile, DistCpOptions options) throws IOException {
+
+    SequenceFile.Writer fileListWriter = null;
+
+    try {
+      fileListWriter = getWriter(pathToListingFile);
+
+      for (Path path: options.getSourcePaths()) {
+        FileSystem sourceFS = path.getFileSystem(getConf());
+        path = makeQualified(path);
+
+        FileStatus rootStatus = sourceFS.getFileStatus(path);
+        Path sourcePathRoot = computeSourceRootPath(rootStatus, options);
+        boolean localFile = (rootStatus.getClass() != FileStatus.class);
+
+        FileStatus[] sourceFiles = sourceFS.listStatus(path);
+        if (sourceFiles != null && sourceFiles.length > 0) {
+          for (FileStatus sourceStatus: sourceFiles) {
+            if (LOG.isDebugEnabled()) {
+              LOG.debug("Recording source-path: " + sourceStatus.getPath() + " for copy.");
+            }
+            writeToFileListing(fileListWriter, sourceStatus, sourcePathRoot, localFile);
+
+            if (isDirectoryAndNotEmpty(sourceFS, sourceStatus)) {
+              if (LOG.isDebugEnabled()) {
+                LOG.debug("Traversing non-empty source dir: " + sourceStatus.getPath());
+              }
+              traverseNonEmptyDirectory(fileListWriter, sourceStatus, sourcePathRoot, localFile);
+            }
+          }
+        } else {
+          writeToFileListing(fileListWriter, rootStatus, sourcePathRoot, localFile);
+        }
+      }
+    } finally {
+      IOUtils.closeStream(fileListWriter);
+    }
+  }
+
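+  // Determines the "source root" for a given source path, i.e. the path against
+  // which relative paths in the copy-listing are computed. A solitary source
+  // file copied onto a file (or a non-existent target) is its own root; otherwise
+  // its parent is used. For directories, -update/-overwrite (or a single source
+  // copied into a non-existent target) use the directory itself as the root, so
+  // that its contents, rather than the directory, land under the target.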
+  private Path computeSourceRootPath(FileStatus sourceStatus,
+                                     DistCpOptions options) throws IOException {
+
+    Path target = options.getTargetPath();
+    FileSystem targetFS = target.getFileSystem(getConf());
+
+    boolean solitaryFile = options.getSourcePaths().size() == 1
+                                                && !sourceStatus.isDirectory();
+
+    if (solitaryFile) {
+      if (targetFS.isFile(target) || !targetFS.exists(target)) {
+        return sourceStatus.getPath();
+      } else {
+        return sourceStatus.getPath().getParent();
+      }
+    } else {
+      boolean specialHandling = (options.getSourcePaths().size() == 1 && !targetFS.exists(target)) ||
+          options.shouldSyncFolder() || options.shouldOverwrite();
+
+      return specialHandling && sourceStatus.isDirectory() ? sourceStatus.getPath() :
+          sourceStatus.getPath().getParent();
+    }
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getBytesToCopy() {
+    return totalBytesToCopy;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  protected long getNumberOfPaths() {
+    return totalPaths;
+  }
+
+  private Path makeQualified(Path path) throws IOException {
+    final FileSystem fs = path.getFileSystem(getConf());
+    return path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
+  }
+
+  private SequenceFile.Writer getWriter(Path pathToListFile) throws IOException {
+    FileSystem fs = pathToListFile.getFileSystem(getConf());
+    if (fs.exists(pathToListFile)) {
+      fs.delete(pathToListFile, false);
+    }
+    return SequenceFile.createWriter(getConf(),
+            SequenceFile.Writer.file(pathToListFile),
+            SequenceFile.Writer.keyClass(Text.class),
+            SequenceFile.Writer.valueClass(FileStatus.class),
+            SequenceFile.Writer.compression(SequenceFile.CompressionType.NONE));
+  }
+
+  private static boolean isDirectoryAndNotEmpty(FileSystem fileSystem,
+                                    FileStatus fileStatus) throws IOException {
+    return fileStatus.isDirectory() && getChildren(fileSystem, fileStatus).length > 0;
+  }
+
+  private static FileStatus[] getChildren(FileSystem fileSystem,
+                                         FileStatus parent) throws IOException {
+    return fileSystem.listStatus(parent.getPath());
+  }
+
+  private void traverseNonEmptyDirectory(SequenceFile.Writer fileListWriter,
+                                         FileStatus sourceStatus,
+                                         Path sourcePathRoot, boolean localFile)
+                                         throws IOException {
+    FileSystem sourceFS = sourcePathRoot.getFileSystem(getConf());
+    Stack<FileStatus> pathStack = new Stack<FileStatus>();
+    pathStack.push(sourceStatus);
+
+    while (!pathStack.isEmpty()) {
+      for (FileStatus child: getChildren(sourceFS, pathStack.pop())) {
+        if (LOG.isDebugEnabled())
+          LOG.debug("Recording source-path: "
+                    + child.getPath() + " for copy.");
+        writeToFileListing(fileListWriter, child, sourcePathRoot, localFile);
+        if (isDirectoryAndNotEmpty(sourceFS, child)) {
+          if (LOG.isDebugEnabled())
+            LOG.debug("Traversing non-empty source dir: "
+                       + child.getPath());
+          pathStack.push(child);
+        }
+      }
+    }
+  }
+
+  private void writeToFileListing(SequenceFile.Writer fileListWriter,
+                                  FileStatus fileStatus, Path sourcePathRoot,
+                                  boolean localFile) throws IOException {
+    if (fileStatus.getPath().equals(sourcePathRoot) && fileStatus.isDirectory())
+      return; // Skip the root-paths.
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("REL PATH: " + DistCpUtils.getRelativePath(sourcePathRoot,
+        fileStatus.getPath()) + ", FULL PATH: " + fileStatus.getPath());
+    }
+
+    FileStatus status = fileStatus;
+    if (localFile) {
+      status = getFileStatus(fileStatus);
+    }
+
+    fileListWriter.append(new Text(DistCpUtils.getRelativePath(sourcePathRoot,
+        fileStatus.getPath())), status);
+    fileListWriter.sync();
+
+    if (!fileStatus.isDirectory()) {
+      totalBytesToCopy += fileStatus.getLen();
+    }
+    totalPaths++;
+  }
+
+  private static final ByteArrayOutputStream buffer = new ByteArrayOutputStream(64);
+  private DataInputBuffer in = new DataInputBuffer();
+  
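+  // When the source FileSystem hands back a FileStatus subclass (as local file
+  // systems may), round-trip it through its Writable form to obtain a plain
+  // FileStatus, since the listing SequenceFile is declared with FileStatus.class
+  // as its value class.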
+  private FileStatus getFileStatus(FileStatus fileStatus) throws IOException {
+    FileStatus status = new FileStatus();
+
+    buffer.reset();
+    DataOutputStream out = new DataOutputStream(buffer);
+    fileStatus.write(out);
+
+    in.reset(buffer.toByteArray(), 0, buffer.size());
+    status.readFields(in);
+    return status;
+  }
+}

+ 297 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java

@@ -0,0 +1,297 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
+import org.apache.hadoop.tools.*;
+import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
+import org.apache.hadoop.tools.util.DistCpUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+/**
+ * The CopyCommitter class is DistCp's OutputCommitter implementation. It is
+ * responsible for handling the completion/cleanup of the DistCp run.
+ * Specifically, it does the following:
+ *  1. Cleanup of the meta-folder (where DistCp maintains its file-list, etc.)
+ *  2. Preservation of user/group/replication-factor on any directories that
+ *     have been copied. (Files are taken care of in their map-tasks.)
+ *  3. Atomic-move of data from the temporary work-folder to the final path
+ *     (if atomic-commit was opted for).
+ *  4. Deletion of files from the target that are missing at source (if opted for).
+ *  5. Cleanup of any partially copied files, from previous, failed attempts.
+ */
+public class CopyCommitter extends FileOutputCommitter {
+  private static final Log LOG = LogFactory.getLog(CopyCommitter.class);
+
+  private final TaskAttemptContext taskAttemptContext;
+
+  /**
+   * Create an output committer.
+   *
+   * @param outputPath the job's output path
+   * @param context    the task's context
+   * @throws IOException - Exception if any
+   */
+  public CopyCommitter(Path outputPath, TaskAttemptContext context) throws IOException {
+    super(outputPath, context);
+    this.taskAttemptContext = context;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void commitJob(JobContext jobContext) throws IOException {
+    Configuration conf = jobContext.getConfiguration();
+    super.commitJob(jobContext);
+
+    cleanupTempFiles(jobContext);
+
+    String attributes = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
+    if (attributes != null && !attributes.isEmpty()) {
+      preserveFileAttributesForDirectories(conf);
+    }
+
+    try {
+      if (conf.getBoolean(DistCpConstants.CONF_LABEL_DELETE_MISSING, false)) {
+        deleteMissing(conf);
+      } else if (conf.getBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, false)) {
+        commitData(conf);
+      }
+      taskAttemptContext.setStatus("Commit Successful");
+    }
+    finally {
+      cleanup(conf);
+    }
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void abortJob(JobContext jobContext,
+                       JobStatus.State state) throws IOException {
+    try {
+      super.abortJob(jobContext, state);
+    } finally {
+      cleanupTempFiles(jobContext);
+      cleanup(jobContext.getConfiguration());
+    }
+  }
+
+  private void cleanupTempFiles(JobContext context) {
+    try {
+      Configuration conf = context.getConfiguration();
+
+      Path targetWorkPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+      FileSystem targetFS = targetWorkPath.getFileSystem(conf);
+
+      String jobId = context.getJobID().toString();
+      deleteAttemptTempFiles(targetWorkPath, targetFS, jobId);
+      deleteAttemptTempFiles(targetWorkPath.getParent(), targetFS, jobId);
+    } catch (Throwable t) {
+      LOG.warn("Unable to cleanup temp files", t);
+    }
+  }
+
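+  // Temp files are named ".distcp.tmp.<task-attempt-id>" (see
+  // RetriableFileCopyCommand#getTmpFile). Replacing "job" with "attempt" in the
+  // job-id yields the common prefix of all task-attempt ids for this job, so the
+  // glob below matches every attempt's leftover temp file.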
+  private void deleteAttemptTempFiles(Path targetWorkPath,
+                                      FileSystem targetFS,
+                                      String jobId) throws IOException {
+
+    FileStatus[] tempFiles = targetFS.globStatus(
+        new Path(targetWorkPath, ".distcp.tmp." + jobId.replaceAll("job","attempt") + "*"));
+
+    if (tempFiles != null && tempFiles.length > 0) {
+      for (FileStatus file : tempFiles) {
+        LOG.info("Cleaning up " + file.getPath());
+        targetFS.delete(file.getPath(), false);
+      }
+    }
+  }
+
+  /**
+   * Cleanup meta folder and other temporary files
+   *
+   * @param conf - Job Configuration
+   */
+  private void cleanup(Configuration conf) {
+    Path metaFolder = new Path(conf.get(DistCpConstants.CONF_LABEL_META_FOLDER));
+    try {
+      FileSystem fs = metaFolder.getFileSystem(conf);
+      LOG.info("Cleaning up temporary work folder: " + metaFolder);
+      fs.delete(metaFolder, true);
+    } catch (IOException ignore) {
+      LOG.error("Exception encountered ", ignore);
+    }
+  }
+
+  // This method changes the target-directories' file-attributes (owner,
+  // user/group permissions, etc.) based on the corresponding source directories.
+  private void preserveFileAttributesForDirectories(Configuration conf) throws IOException {
+    String attrSymbols = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
+    LOG.info("About to preserve attributes: " + attrSymbols);
+
+    EnumSet<FileAttribute> attributes = DistCpUtils.unpackAttributes(attrSymbols);
+
+    Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
+    FileSystem clusterFS = sourceListing.getFileSystem(conf);
+    SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf,
+                                      SequenceFile.Reader.file(sourceListing));
+    long totalLen = clusterFS.getFileStatus(sourceListing).getLen();
+
+    Path targetRoot = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+
+    long preservedEntries = 0;
+    try {
+      FileStatus srcFileStatus = new FileStatus();
+      Text srcRelPath = new Text();
+
+      // Iterate over every source path that was copied.
+      while (sourceReader.next(srcRelPath, srcFileStatus)) {
+        // File-attributes for files are set at the time of copy,
+        // in the map-task.
+        if (! srcFileStatus.isDirectory()) continue;
+
+        Path targetFile = new Path(targetRoot.toString() + "/" + srcRelPath);
+
+        // Skip the root folder.
+        // Status can't be preserved on root-folder. (E.g. multiple paths may
+        // be copied to a single target folder. Which source-attributes to use
+        // on the target is undefined.)
+        if (targetRoot.equals(targetFile)) continue;
+
+        FileSystem targetFS = targetFile.getFileSystem(conf);
+        DistCpUtils.preserve(targetFS, targetFile, srcFileStatus, attributes);
+        preservedEntries++; // Count directories whose attributes were actually preserved.
+
+        taskAttemptContext.progress();
+        taskAttemptContext.setStatus("Preserving status on directory entries. [" +
+            sourceReader.getPosition() * 100 / totalLen + "%]");
+      }
+    } finally {
+      IOUtils.closeStream(sourceReader);
+    }
+    LOG.info("Preserved status on " + preservedEntries + " dir entries on target");
+  }
+
+  // This method deletes "extra" files from the target, if they're not
+  // available at the source.
+  private void deleteMissing(Configuration conf) throws IOException {
+    LOG.info("-delete option is enabled. About to remove entries from " +
+        "target that are missing in source");
+
+    // Sort the source-file listing alphabetically.
+    Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
+    FileSystem clusterFS = sourceListing.getFileSystem(conf);
+    Path sortedSourceListing = DistCpUtils.sortListing(clusterFS, conf, sourceListing);
+
+    // Similarly, create the listing of target-files. Sort alphabetically.
+    Path targetListing = new Path(sourceListing.getParent(), "targetListing.seq");
+    CopyListing target = new GlobbedCopyListing(new Configuration(conf), null);
+
+    List<Path> targets = new ArrayList<Path>(1);
+    Path targetFinalPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
+    targets.add(targetFinalPath);
+    DistCpOptions options = new DistCpOptions(targets, new Path("/NONE"));
+
+    target.buildListing(targetListing, options);
+    Path sortedTargetListing = DistCpUtils.sortListing(clusterFS, conf, targetListing);
+    long totalLen = clusterFS.getFileStatus(sortedTargetListing).getLen();
+
+    SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf,
+                                 SequenceFile.Reader.file(sortedSourceListing));
+    SequenceFile.Reader targetReader = new SequenceFile.Reader(conf,
+                                 SequenceFile.Reader.file(sortedTargetListing));
+
+    // Walk both source and target file listings.
+    // Delete all from target that doesn't also exist on source.
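+    // Both listings are sorted by relative path, so a single forward pass over
+    // each (a sort-merge style anti-join) suffices.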
+    long deletedEntries = 0;
+    try {
+      FileStatus srcFileStatus = new FileStatus();
+      Text srcRelPath = new Text();
+      FileStatus trgtFileStatus = new FileStatus();
+      Text trgtRelPath = new Text();
+
+      FileSystem targetFS = targetFinalPath.getFileSystem(conf);
+      boolean srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
+      while (targetReader.next(trgtRelPath, trgtFileStatus)) {
+        // Skip sources that don't exist on target.
+        while (srcAvailable && trgtRelPath.compareTo(srcRelPath) > 0) {
+          srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
+        }
+
+        if (srcAvailable && trgtRelPath.equals(srcRelPath)) continue;
+
+        // Target doesn't exist at source. Delete.
+        boolean result = (!targetFS.exists(trgtFileStatus.getPath()) ||
+            targetFS.delete(trgtFileStatus.getPath(), true));
+        if (result) {
+          LOG.info("Deleted " + trgtFileStatus.getPath() + " - Missing at source");
+          deletedEntries++;
+        } else {
+          throw new IOException("Unable to delete " + trgtFileStatus.getPath());
+        }
+        taskAttemptContext.progress();
+        taskAttemptContext.setStatus("Deleting missing files from target. [" +
+            targetReader.getPosition() * 100 / totalLen + "%]");
+      }
+    } finally {
+      IOUtils.closeStream(sourceReader);
+      IOUtils.closeStream(targetReader);
+    }
+    LOG.info("Deleted " + deletedEntries + " from target: " + targets.get(0));
+  }
+
+  private void commitData(Configuration conf) throws IOException {
+
+    Path workDir = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+    Path finalDir = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
+    FileSystem targetFS = workDir.getFileSystem(conf);
+
+    LOG.info("Atomic commit enabled. Moving " + workDir + " to " + finalDir);
+    if (targetFS.exists(finalDir) && targetFS.exists(workDir)) {
+      LOG.error("Pre-existing final-path found at: " + finalDir);
+      throw new IOException("Target-path can't be committed to because it " +
+          "exists at " + finalDir + ". Copied data is in temp-dir: " + workDir + ". ");
+    }
+
+    boolean result = targetFS.rename(workDir, finalDir);
+    if (!result) {
+      LOG.warn("Rename failed. Perhaps data already moved. Verifying...");
+      result = targetFS.exists(finalDir) && !targetFS.exists(workDir);
+    }
+    if (result) {
+      LOG.info("Data committed successfully to " + finalDir);
+      taskAttemptContext.setStatus("Data committed successfully to " + finalDir);
+    } else {
+      LOG.error("Unable to commit data to " + finalDir);
+      throw new IOException("Atomic commit failed. Temporary data in " + workDir +
+        ", Unable to move to " + finalDir);
+    }
+  }
+}

+ 330 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyMapper.java

@@ -0,0 +1,330 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.tools.DistCpOptionSwitch;
+import org.apache.hadoop.tools.DistCpOptions;
+import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.util.StringUtils;
+
+import java.io.*;
+import java.util.EnumSet;
+import java.util.Arrays;
+
+/**
+ * Mapper class that executes the DistCp copy operation.
+ * Extends the o.a.h.mapreduce.Mapper<> base class.
+ */
+public class CopyMapper extends Mapper<Text, FileStatus, Text, Text> {
+
+  /**
+   * Hadoop counters for the DistCp CopyMapper.
+   * (These have been kept identical to the old DistCp,
+   * for backward compatibility.)
+   */
+  public static enum Counter {
+    COPY,         // Number of files received by the mapper for copy.
+    SKIP,         // Number of files skipped.
+    FAIL,         // Number of files that failed to be copied.
+    BYTESCOPIED,  // Number of bytes actually copied by the copy-mapper, total.
+    BYTESEXPECTED,// Number of bytes expected to be copied.
+    BYTESFAILED,  // Number of bytes that failed to be copied.
+    BYTESSKIPPED, // Number of bytes that were skipped from copy.
+  }
+
+  private static Log LOG = LogFactory.getLog(CopyMapper.class);
+
+  private Configuration conf;
+
+  private boolean syncFolders = false;
+  private boolean ignoreFailures = false;
+  private boolean skipCrc = false;
+  private boolean overWrite = false;
+  private EnumSet<FileAttribute> preserve = EnumSet.noneOf(FileAttribute.class);
+
+  private FileSystem targetFS = null;
+  private Path    targetWorkPath = null;
+
+  /**
+   * Implementation of the Mapper::setup() method. This extracts the DistCp-
+   * options specified in the Job's configuration, to set up the Job.
+   * @param context Mapper's context.
+   * @throws IOException On IO failure.
+   * @throws InterruptedException If the job is interrupted.
+   */
+  @Override
+  public void setup(Context context) throws IOException, InterruptedException {
+    conf = context.getConfiguration();
+
+    syncFolders = conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false);
+    ignoreFailures = conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false);
+    skipCrc = conf.getBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(), false);
+    overWrite = conf.getBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(), false);
+    preserve = DistCpUtils.unpackAttributes(conf.get(DistCpOptionSwitch.
+        PRESERVE_STATUS.getConfigLabel()));
+
+    targetWorkPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+    Path targetFinalPath = new Path(conf.get(
+            DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
+    targetFS = targetFinalPath.getFileSystem(conf);
+
+    if (targetFS.exists(targetFinalPath) && targetFS.isFile(targetFinalPath)) {
+      overWrite = true; // When target is an existing file, overwrite it.
+    }
+
+    if (conf.get(DistCpConstants.CONF_LABEL_SSL_CONF) != null) {
+      initializeSSLConf(context);
+    }
+  }
+
+  /**
+   * Initialize the SSL configuration, if one has been specified in the job configuration.
+   *
+   * @throws IOException On failure to localize or write out the SSL configuration.
+   */
+  private void initializeSSLConf(Context context) throws IOException {
+    LOG.info("Initializing SSL configuration");
+    
+    String workDir = conf.get(JobContext.JOB_LOCAL_DIR) + "/work";
+    Path[] cacheFiles = context.getLocalCacheFiles();
+
+    Configuration sslConfig = new Configuration(false);
+    String sslConfFileName = conf.get(DistCpConstants.CONF_LABEL_SSL_CONF);
+    Path sslClient = findCacheFile(cacheFiles, sslConfFileName);
+    if (sslClient == null) {
+      LOG.warn("SSL Client config file not found. Was looking for " + sslConfFileName +
+          " in " + Arrays.toString(cacheFiles));
+      return;
+    }
+    sslConfig.addResource(sslClient);
+
+    String trustStoreFile = conf.get("ssl.client.truststore.location");
+    Path trustStorePath = findCacheFile(cacheFiles, trustStoreFile);
+    sslConfig.set("ssl.client.truststore.location", trustStorePath.toString());
+
+    String keyStoreFile = conf.get("ssl.client.keystore.location");
+    Path keyStorePath = findCacheFile(cacheFiles, keyStoreFile);
+    sslConfig.set("ssl.client.keystore.location", keyStorePath.toString());
+
+    try {
+      OutputStream out = new FileOutputStream(workDir + "/" + sslConfFileName);
+      try {
+        sslConfig.writeXml(out);
+      } finally {
+        out.close();
+      }
+      conf.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfFileName);
+    } catch (IOException e) {
+      LOG.warn("Unable to write out the ssl configuration. " +
+          "Will fall back to default ssl-client.xml in class path, if there is one", e);
+    }
+  }
+
+  /**
+   * Find entry from distributed cache
+   *
+   * @param cacheFiles - All localized cache files
+   * @param fileName - fileName to search
+   * @return Path of the filename if found, else null
+   */
+  private Path findCacheFile(Path[] cacheFiles, String fileName) {
+    if (cacheFiles != null && cacheFiles.length > 0) {
+      for (Path file : cacheFiles) {
+        if (file.getName().equals(fileName)) {
+          return file;
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Implementation of the Mapper<>::map(). Does the copy.
+   * @param relPath The file's path, relative to the source root (used to derive the target path).
+   * @param sourceFileStatus The FileStatus of the source file.
+   * @throws IOException On failure to copy.
+   */
+  @Override
+  public void map(Text relPath, FileStatus sourceFileStatus, Context context)
+          throws IOException, InterruptedException {
+    Path sourcePath = sourceFileStatus.getPath();
+
+    if (LOG.isDebugEnabled())
+      LOG.debug("DistCpMapper::map(): Received " + sourcePath + ", " + relPath);
+
+    Path target = new Path(targetWorkPath.makeQualified(targetFS.getUri(),
+                          targetFS.getWorkingDirectory()) + relPath.toString());
+
+    EnumSet<DistCpOptions.FileAttribute> fileAttributes
+            = getFileAttributeSettings(context);
+
+    final String description = "Copying " + sourcePath + " to " + target;
+    context.setStatus(description);
+
+    LOG.info(description);
+
+    try {
+      FileStatus sourceCurrStatus;
+      FileSystem sourceFS;
+      try {
+        sourceFS = sourcePath.getFileSystem(conf);
+        sourceCurrStatus = sourceFS.getFileStatus(sourcePath);
+      } catch (FileNotFoundException e) {
+        throw new IOException(new RetriableFileCopyCommand.CopyReadException(e));
+      }
+
+      FileStatus targetStatus = null;
+
+      try {
+        targetStatus = targetFS.getFileStatus(target);
+      } catch (FileNotFoundException ignore) {
+        if (LOG.isDebugEnabled())
+          LOG.debug("Path could not be found: " + target, ignore);
+      }
+
+      if (targetStatus != null && (targetStatus.isDirectory() != sourceCurrStatus.isDirectory())) {
+        throw new IOException("Can't replace " + target + ". Target is " +
+            getFileType(targetStatus) + ", Source is " + getFileType(sourceCurrStatus));
+      }
+
+      if (sourceCurrStatus.isDirectory()) {
+        createTargetDirsWithRetry(description, target, context);
+        return;
+      }
+
+      if (skipFile(sourceFS, sourceCurrStatus, target)) {
+        LOG.info("Skipping copy of " + sourceCurrStatus.getPath()
+                 + " to " + target);
+        updateSkipCounters(context, sourceCurrStatus);
+        context.write(null, new Text("SKIP: " + sourceCurrStatus.getPath()));
+      }
+      else {
+        copyFileWithRetry(description, sourceCurrStatus, target, context,
+                          fileAttributes);
+      }
+
+      DistCpUtils.preserve(target.getFileSystem(conf), target,
+                           sourceCurrStatus, fileAttributes);
+
+    } catch (IOException exception) {
+      handleFailures(exception, sourceFileStatus, target, context);
+    }
+  }
+
+  private String getFileType(FileStatus fileStatus) {
+    return fileStatus == null ? "N/A" : (fileStatus.isDirectory() ? "dir" : "file");
+  }
+
+  private static EnumSet<DistCpOptions.FileAttribute>
+          getFileAttributeSettings(Mapper.Context context) {
+    String attributeString = context.getConfiguration().get(
+            DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel());
+    return DistCpUtils.unpackAttributes(attributeString);
+  }
+
+  private void copyFileWithRetry(String description, FileStatus sourceFileStatus,
+               Path target, Context context,
+               EnumSet<DistCpOptions.FileAttribute> fileAttributes) throws IOException {
+
+    long bytesCopied;
+    try {
+      bytesCopied = (Long)new RetriableFileCopyCommand(description)
+                       .execute(sourceFileStatus, target, context, fileAttributes);
+    } catch (Exception e) {
+      context.setStatus("Copy Failure: " + sourceFileStatus.getPath());
+      throw new IOException("File copy failed: " + sourceFileStatus.getPath() +
+          " --> " + target, e);
+    }
+    incrementCounter(context, Counter.BYTESEXPECTED, sourceFileStatus.getLen());
+    incrementCounter(context, Counter.BYTESCOPIED, bytesCopied);
+    incrementCounter(context, Counter.COPY, 1);
+  }
+
+  private void createTargetDirsWithRetry(String description,
+                   Path target, Context context) throws IOException {
+    try {
+      new RetriableDirectoryCreateCommand(description).execute(target, context);
+    } catch (Exception e) {
+      throw new IOException("mkdir failed for " + target, e);
+    }
+    incrementCounter(context, Counter.COPY, 1);
+  }
+
+  private static void updateSkipCounters(Context context,
+                                         FileStatus sourceFile) {
+    incrementCounter(context, Counter.SKIP, 1);
+    incrementCounter(context, Counter.BYTESSKIPPED, sourceFile.getLen());
+
+  }
+
+  private void handleFailures(IOException exception,
+                                     FileStatus sourceFileStatus, Path target,
+                                     Context context) throws IOException, InterruptedException {
+    LOG.error("Failure in copying " + sourceFileStatus.getPath() + " to " +
+                target, exception);
+
+    if (ignoreFailures && exception.getCause() instanceof
+            RetriableFileCopyCommand.CopyReadException) {
+      incrementCounter(context, Counter.FAIL, 1);
+      incrementCounter(context, Counter.BYTESFAILED, sourceFileStatus.getLen());
+      context.write(null, new Text("FAIL: " + sourceFileStatus.getPath() + " - " + 
+          StringUtils.stringifyException(exception)));
+    }
+    else
+      throw exception;
+  }
+
+  private static void incrementCounter(Context context, Counter counter,
+                                       long value) {
+    context.getCounter(counter).increment(value);
+  }
+
+  private boolean skipFile(FileSystem sourceFS, FileStatus source, Path target)
+                                          throws IOException {
+    return     targetFS.exists(target)
+            && !overWrite
+            && !mustUpdate(sourceFS, source, target);
+  }
+
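+  // Under -update, a target file is re-copied only if its length differs, its
+  // checksum differs (unless CRC-checking is skipped), or its block size differs
+  // while block-size preservation has been requested.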
+  private boolean mustUpdate(FileSystem sourceFS, FileStatus source, Path target)
+                                    throws IOException {
+    final FileStatus targetFileStatus = targetFS.getFileStatus(target);
+
+    return     syncFolders
+            && (
+                   targetFileStatus.getLen() != source.getLen()
+                || (!skipCrc &&
+                       !DistCpUtils.checksumsAreEqual(sourceFS,
+                                          source.getPath(), targetFS, target))
+                || (source.getBlockSize() != targetFileStatus.getBlockSize() &&
+                      preserve.contains(FileAttribute.BLOCKSIZE))
+               );
+  }
+}

+ 124 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyOutputFormat.java

@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.security.TokenCache;
+import org.apache.hadoop.tools.DistCpConstants;
+
+import java.io.IOException;
+
+/**
+ * The CopyOutputFormat is the Hadoop OutputFormat used in DistCp.
+ * It sets up the Job's Configuration (in the Job-Context) with the settings
+ * for the work-directory, final commit-directory, etc. It also sets the right
+ * output-committer.
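+ *
+ * A rough usage sketch (the job and paths below are illustrative only):
+ * <pre>
+ *   Job job = ...;
+ *   CopyOutputFormat.setWorkingDirectory(job, workPath);
+ *   CopyOutputFormat.setCommitDirectory(job, finalPath);
+ *   job.setOutputFormatClass(CopyOutputFormat.class);
+ * </pre>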
+ * @param <K>
+ * @param <V>
+ */
+public class CopyOutputFormat<K, V> extends TextOutputFormat<K, V> {
+
+  /**
+   * Setter for the working directory for DistCp (where files will be copied
+   * before they are moved to the final commit-directory.)
+   * @param job The Job on whose configuration the working-directory is to be set.
+   * @param workingDirectory The path to use as the working directory.
+   */
+  public static void setWorkingDirectory(Job job, Path workingDirectory) {
+    job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH,
+        workingDirectory.toString());
+  }
+
+  /**
+   * Setter for the final directory for DistCp (where files copied will be
+   * moved, atomically.)
+   * @param job The Job on whose configuration the working-directory is to be set.
+   * @param commitDirectory The path to use for final commit.
+   */
+  public static void setCommitDirectory(Job job, Path commitDirectory) {
+    job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH,
+        commitDirectory.toString());
+  }
+
+  /**
+   * Getter for the working directory.
+   * @param job The Job from whose configuration the working-directory is to
+   * be retrieved.
+   * @return The working-directory Path.
+   */
+  public static Path getWorkingDirectory(Job job) {
+    return getWorkingDirectory(job.getConfiguration());
+  }
+
+  private static Path getWorkingDirectory(Configuration conf) {
+    String workingDirectory = conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH);
+    if (workingDirectory == null || workingDirectory.isEmpty()) {
+      return null;
+    } else {
+      return new Path(workingDirectory);
+    }
+  }
+
+  /**
+   * Getter for the final commit-directory.
+   * @param job The Job from whose configuration the commit-directory is to be
+   * retrieved.
+   * @return The commit-directory Path.
+   */
+  public static Path getCommitDirectory(Job job) {
+    return getCommitDirectory(job.getConfiguration());
+  }
+
+  private static Path getCommitDirectory(Configuration conf) {
+    String commitDirectory = conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH);
+    if (commitDirectory == null || commitDirectory.isEmpty()) {
+      return null;
+    } else {
+      return new Path(commitDirectory);
+    }
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
+    return new CopyCommitter(getOutputPath(context), context);
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void checkOutputSpecs(JobContext context) throws IOException {
+    Configuration conf = context.getConfiguration();
+
+    if (getCommitDirectory(conf) == null) {
+      throw new IllegalStateException("Commit directory not configured");
+    }
+
+    Path workingPath = getWorkingDirectory(conf);
+    if (workingPath == null) {
+      throw new IllegalStateException("Working directory not configured");
+    }
+
+    // get delegation token for outDir's file system
+    TokenCache.obtainTokensForNamenodes(context.getCredentials(),
+                                        new Path[] {workingPath}, conf);
+  }
+}

+ 56 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableDirectoryCreateCommand.java

@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.hadoop.tools.util.RetriableCommand;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.mapreduce.Mapper;
+
+/**
+ * This class extends RetriableCommand to implement the creation of directories,
+ * with retries on failure.
+ */
+public class RetriableDirectoryCreateCommand extends RetriableCommand {
+
+  /**
+   * Constructor, taking a description of the action.
+   * @param description Verbose description of the copy operation.
+   */
+  public RetriableDirectoryCreateCommand(String description) {
+    super(description);
+  }
+
+  /**
+   * Implementation of RetriableCommand::doExecute().
+   * This implements the actual mkdirs() functionality.
+   * @param arguments Argument-list to the command.
+   * @return Boolean. True, if the directory could be created successfully.
+   * @throws Exception IOException, on failure to create the directory.
+   */
+  @Override
+  protected Object doExecute(Object... arguments) throws Exception {
+    assert arguments.length == 2 : "Unexpected argument list.";
+    Path target = (Path)arguments[0];
+    Mapper.Context context = (Mapper.Context)arguments[1];
+
+    FileSystem targetFS = target.getFileSystem(context.getConfiguration());
+    return targetFS.mkdirs(target);
+  }
+}

+ 245 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java

@@ -0,0 +1,245 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.hadoop.tools.util.RetriableCommand;
+import org.apache.hadoop.tools.util.ThrottledInputStream;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.tools.DistCpOptions.*;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.io.*;
+import java.util.EnumSet;
+
+/**
+ * This class extends RetriableCommand to implement the copy of files,
+ * with retries on failure.
+ */
+public class RetriableFileCopyCommand extends RetriableCommand {
+
+  private static Log LOG = LogFactory.getLog(RetriableFileCopyCommand.class);
+  private static int BUFFER_SIZE = 8 * 1024;
+
+  /**
+   * Constructor, taking a description of the action.
+   * @param description Verbose description of the copy operation.
+   */
+  public RetriableFileCopyCommand(String description) {
+    super(description);
+  }
+
+  /**
+   * Implementation of RetriableCommand::doExecute().
+   * This is the actual copy-implementation.
+   * @param arguments Argument-list to the command.
+   * @return Number of bytes copied.
+   * @throws Exception CopyReadException, if there are read-failures. All other
+   *         failures are IOExceptions.
+   */
+  @SuppressWarnings("unchecked")
+  @Override
+  protected Object doExecute(Object... arguments) throws Exception {
+    assert arguments.length == 4 : "Unexpected argument list.";
+    FileStatus source = (FileStatus)arguments[0];
+    assert !source.isDirectory() : "Unexpected file-status. Expected file.";
+    Path target = (Path)arguments[1];
+    Mapper.Context context = (Mapper.Context)arguments[2];
+    EnumSet<FileAttribute> fileAttributes
+            = (EnumSet<FileAttribute>)arguments[3];
+    return doCopy(source, target, context, fileAttributes);
+  }
+
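+  // The copy is staged through a task-attempt-specific temp file: bytes are
+  // copied to the temp file, the length and checksum are verified against the
+  // source, and only then is the temp file renamed onto the final target, so a
+  // partially written target is never left visible.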
+  private long doCopy(FileStatus sourceFileStatus, Path target,
+                      Mapper.Context context,
+                      EnumSet<FileAttribute> fileAttributes)
+          throws IOException {
+
+    Path tmpTargetPath = getTmpFile(target, context);
+    final Configuration configuration = context.getConfiguration();
+    FileSystem targetFS = target.getFileSystem(configuration);
+
+    try {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Copying " + sourceFileStatus.getPath() + " to " + target);
+        LOG.debug("Tmp-file path: " + tmpTargetPath);
+      }
+      FileSystem sourceFS = sourceFileStatus.getPath().getFileSystem(
+              configuration);
+      long bytesRead = copyToTmpFile(tmpTargetPath, targetFS, sourceFileStatus,
+                                     context, fileAttributes);
+
+      compareFileLengths(sourceFileStatus, tmpTargetPath, configuration, bytesRead);
+      compareCheckSums(sourceFS, sourceFileStatus.getPath(), targetFS, tmpTargetPath);
+      promoteTmpToTarget(tmpTargetPath, target, targetFS);
+      return bytesRead;
+
+    } finally {
+      if (targetFS.exists(tmpTargetPath))
+        targetFS.delete(tmpTargetPath, false);
+    }
+  }
+
+  private long copyToTmpFile(Path tmpTargetPath, FileSystem targetFS,
+                             FileStatus sourceFileStatus, Mapper.Context context,
+                             EnumSet<FileAttribute> fileAttributes)
+                             throws IOException {
+    OutputStream outStream = new BufferedOutputStream(targetFS.create(
+            tmpTargetPath, true, BUFFER_SIZE,
+            getReplicationFactor(fileAttributes, sourceFileStatus, targetFS),
+            getBlockSize(fileAttributes, sourceFileStatus, targetFS), context));
+    return copyBytes(sourceFileStatus, outStream, BUFFER_SIZE, true, context);
+  }
+
+  private void compareFileLengths(FileStatus sourceFileStatus, Path target,
+                                  Configuration configuration, long bytesRead)
+                                  throws IOException {
+    final Path sourcePath = sourceFileStatus.getPath();
+    FileSystem fs = sourcePath.getFileSystem(configuration);
+    if (fs.getFileStatus(sourcePath).getLen() != bytesRead)
+      throw new IOException("Mismatch in length of source:" + sourcePath
+                + " and target:" + target);
+  }
+
+  private void compareCheckSums(FileSystem sourceFS, Path source,
+                                FileSystem targetFS, Path target)
+                                throws IOException {
+    if (!DistCpUtils.checksumsAreEqual(sourceFS, source, targetFS, target))
+      throw new IOException("Check-sum mismatch between "
+                              + source + " and " + target);
+
+  }
+
+  //If target file exists and unable to delete target - fail
+  //If target doesn't exist and unable to create parent folder - fail
+  //If target is successfully deleted and parent exists, if rename fails - fail
+  private void promoteTmpToTarget(Path tmpTarget, Path target, FileSystem fs)
+                                  throws IOException {
+    if ((fs.exists(target) && !fs.delete(target, false))
+        || (!fs.exists(target.getParent()) && !fs.mkdirs(target.getParent()))
+        || !fs.rename(tmpTarget, target)) {
+      throw new IOException("Failed to promote tmp-file:" + tmpTarget
+                              + " to: " + target);
+    }
+  }
+
+  private Path getTmpFile(Path target, Mapper.Context context) {
+    Path targetWorkPath = new Path(context.getConfiguration().
+        get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
+
+    Path root = target.equals(targetWorkPath)? targetWorkPath.getParent() : targetWorkPath;
+    Path tmpFile = new Path(root, ".distcp.tmp." + context.getTaskAttemptID().toString());
+    LOG.info("Creating temp file: " + tmpFile);
+    return tmpFile;
+  }
+
+  private long copyBytes(FileStatus sourceFileStatus, OutputStream outStream,
+                         int bufferSize, boolean mustCloseStream,
+                         Mapper.Context context) throws IOException {
+    Path source = sourceFileStatus.getPath();
+    byte buf[] = new byte[bufferSize];
+    ThrottledInputStream inStream = null;
+    long totalBytesRead = 0;
+
+    try {
+      inStream = getInputStream(source, context.getConfiguration());
+      int bytesRead = readBytes(inStream, buf);
+      while (bytesRead >= 0) {
+        totalBytesRead += bytesRead;
+        outStream.write(buf, 0, bytesRead);
+        updateContextStatus(totalBytesRead, context, sourceFileStatus);
+        bytesRead = readBytes(inStream, buf); // Wrap mid-stream read failures as CopyReadException too.
+      }
+    } finally {
+      if (mustCloseStream)
+        IOUtils.cleanup(LOG, outStream, inStream);
+    }
+
+    return totalBytesRead;
+  }
+
+  private void updateContextStatus(long totalBytesRead, Mapper.Context context,
+                                   FileStatus sourceFileStatus) {
+    StringBuilder message = new StringBuilder(DistCpUtils.getFormatter()
+                .format(totalBytesRead * 100.0f / sourceFileStatus.getLen()));
+    message.append("% ")
+            .append(description).append(" [")
+            .append(DistCpUtils.getStringDescriptionFor(totalBytesRead))
+            .append('/')
+        .append(DistCpUtils.getStringDescriptionFor(sourceFileStatus.getLen()))
+            .append(']');
+    context.setStatus(message.toString());
+  }
+
+  private static int readBytes(InputStream inStream, byte buf[])
+          throws IOException {
+    try {
+      return inStream.read(buf);
+    }
+    catch (IOException e) {
+      throw new CopyReadException(e);
+    }
+  }
+
+  private static ThrottledInputStream getInputStream(Path path, Configuration conf)
+          throws IOException {
+    try {
+      FileSystem fs = path.getFileSystem(conf);
+      long bandwidthMB = conf.getInt(DistCpConstants.CONF_LABEL_BANDWIDTH_MB,
+              DistCpConstants.DEFAULT_BANDWIDTH_MB);
+      return new ThrottledInputStream(new BufferedInputStream(fs.open(path)),
+              bandwidthMB * 1024 * 1024);
+    }
+    catch (IOException e) {
+      throw new CopyReadException(e);
+    }
+  }
+
+  private static short getReplicationFactor(
+          EnumSet<FileAttribute> fileAttributes,
+          FileStatus sourceFile, FileSystem targetFS) {
+    return fileAttributes.contains(FileAttribute.REPLICATION)?
+            sourceFile.getReplication() : targetFS.getDefaultReplication();
+  }
+
+  private static long getBlockSize(
+          EnumSet<FileAttribute> fileAttributes,
+          FileStatus sourceFile, FileSystem targetFS) {
+    return fileAttributes.contains(FileAttribute.BLOCKSIZE)?
+            sourceFile.getBlockSize() : targetFS.getDefaultBlockSize();
+  }
+
+  /**
+   * Special subclass of IOException. This is used to distinguish read-operation
+   * failures from other kinds of IOExceptions.
+   * The failure to read from source is dealt with specially, in the CopyMapper.
+   * Such failures may be skipped if the DistCpOptions indicate so.
+   * Write failures are intolerable, and amount to CopyMapper failure.  
+   */
+  public static class CopyReadException extends IOException {
+    public CopyReadException(Throwable rootCause) {
+      super(rootCause);
+    }
+  }
+}
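
CopyReadException lets the mapper treat unreadable sources differently from write failures: read-side errors may be skipped when the job is configured to tolerate them, while any other IOException fails the task. The standalone Java sketch below illustrates that pattern only; the class, interface, and flag names are hypothetical and not part of this patch.

// Standalone sketch (illustrative, not from this patch): read-side failures
// may be logged and skipped, while every other IOException stays fatal.
import java.io.IOException;

public class CopyFailureHandlingSketch {

  /** Stand-in for the copy routine whose failures we want to classify. */
  interface CopyAction {
    long copy() throws IOException;
  }

  /** Mirrors the role of CopyReadException above. */
  static class ReadFailure extends IOException {
    ReadFailure(Throwable cause) { super(cause); }
  }

  static long copyOrSkip(CopyAction action, boolean ignoreReadFailures)
      throws IOException {
    try {
      return action.copy();
    } catch (ReadFailure e) {
      if (ignoreReadFailures) {
        System.err.println("Skipping unreadable source: " + e.getCause());
        return 0L;   // record the skip and move on to the next file
      }
      throw e;       // configured to fail fast on read errors
    }
    // Any other IOException (for example a write failure) is not caught here,
    // so it propagates and fails the calling task.
  }
}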

+ 169 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/UniformSizeInputFormat.java

@@ -0,0 +1,169 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.mapred;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.conf.Configuration;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * UniformSizeInputFormat extends the InputFormat<> class, to produce
+ * input-splits for DistCp.
+ * It reads the copy-listing and groups its entries into input-splits such
+ * that the total number of bytes to be copied by each input-split is
+ * approximately equal.
+ */
+public class UniformSizeInputFormat extends InputFormat<Text, FileStatus> {
+  private static final Log LOG
+                = LogFactory.getLog(UniformSizeInputFormat.class);
+
+  /**
+   * Implementation of InputFormat::getSplits(). Returns a list of InputSplits,
+   * such that the number of bytes to be copied by each split is
+   * approximately equal.
+   * @param context JobContext for the job.
+   * @return The list of uniformly-distributed input-splits.
+   * @throws IOException On failure.
+   * @throws InterruptedException
+   */
+  @Override
+  public List<InputSplit> getSplits(JobContext context)
+                      throws IOException, InterruptedException {
+    Configuration configuration = context.getConfiguration();
+    int numSplits = DistCpUtils.getInt(configuration,
+                                       JobContext.NUM_MAPS);
+
+    if (numSplits == 0) return new ArrayList<InputSplit>();
+
+    return getSplits(configuration, numSplits,
+                     DistCpUtils.getLong(configuration,
+                          DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED));
+  }
+
+  private List<InputSplit> getSplits(Configuration configuration, int numSplits,
+                                     long totalSizeBytes) throws IOException {
+    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
+    long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);
+
+    FileStatus srcFileStatus = new FileStatus();
+    Text srcRelPath = new Text();
+    long currentSplitSize = 0;
+    long lastSplitStart = 0;
+    long lastPosition = 0;
+
+    final Path listingFilePath = getListingFilePath(configuration);
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Average bytes per map: " + nBytesPerSplit +
+          ", Number of maps: " + numSplits + ", total size: " + totalSizeBytes);
+    }
+    SequenceFile.Reader reader = null;
+    try {
+      reader = getListingFileReader(configuration);
+      while (reader.next(srcRelPath, srcFileStatus)) {
+        // If adding the current file would push this split past the
+        // bytes-per-map limit, close the current split and start a new one.
+        if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) {
+          FileSplit split = new FileSplit(listingFilePath, lastSplitStart,
+              lastPosition - lastSplitStart, null);
+          if (LOG.isDebugEnabled()) {
+            LOG.debug ("Creating split : " + split + ", bytes in split: " + currentSplitSize);
+          }
+          splits.add(split);
+          lastSplitStart = lastPosition;
+          currentSplitSize = 0;
+        }
+        currentSplitSize += srcFileStatus.getLen();
+        lastPosition = reader.getPosition();
+      }
+      if (lastPosition > lastSplitStart) {
+        FileSplit split = new FileSplit(listingFilePath, lastSplitStart,
+            lastPosition - lastSplitStart, null);
+        if (LOG.isDebugEnabled()) {
+          LOG.info ("Creating split : " + split + ", bytes in split: " + currentSplitSize);
+        }
+        splits.add(split);
+      }
+
+    } finally {
+      IOUtils.closeStream(reader);
+    }
+
+    return splits;
+  }
+
+  private static Path getListingFilePath(Configuration configuration) {
+    final String listingFilePathString =
+            configuration.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
+
+    assert !listingFilePathString.equals("")
+              : "Couldn't find listing file. Invalid input.";
+    return new Path(listingFilePathString);
+  }
+
+  private SequenceFile.Reader getListingFileReader(Configuration configuration) {
+
+    final Path listingFilePath = getListingFilePath(configuration);
+    try {
+      final FileSystem fileSystem = listingFilePath.getFileSystem(configuration);
+      if (!fileSystem.exists(listingFilePath))
+        throw new IllegalArgumentException("Listing file doesn't exist at: "
+                                           + listingFilePath);
+
+      return new SequenceFile.Reader(configuration,
+                                     SequenceFile.Reader.file(listingFilePath));
+    }
+    catch (IOException exception) {
+      LOG.error("Couldn't find listing file at: " + listingFilePath, exception);
+      throw new IllegalArgumentException("Couldn't find listing-file at: "
+                                         + listingFilePath, exception);
+    }
+  }
+
+  /**
+   * Implementation of InputFormat::createRecordReader().
+   * @param split The split for which the RecordReader is sought.
+   * @param context The context of the current task-attempt.
+   * @return A SequenceFileRecordReader instance, (since the copy-listing is a
+   * simple sequence-file.)
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  @Override
+  public RecordReader<Text, FileStatus> createRecordReader(InputSplit split,
+                                                     TaskAttemptContext context)
+                                      throws IOException, InterruptedException {
+    return new SequenceFileRecordReader<Text, FileStatus>();
+  }
+}
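
getSplits() above walks the copy-listing in order and starts a new split whenever the running byte count would exceed ceil(totalBytes / numSplits). The self-contained sketch below reproduces that greedy grouping on plain file sizes (no SequenceFile involved) so the arithmetic is easy to verify; it is an illustration, not code from this patch.

// Illustrative reimplementation of the greedy grouping used by getSplits(),
// operating on raw file sizes instead of a SequenceFile copy-listing.
import java.util.ArrayList;
import java.util.List;

public class UniformGroupingSketch {

  static List<List<Long>> group(long[] fileSizes, int numSplits) {
    long total = 0;
    for (long size : fileSizes) total += size;
    long bytesPerSplit = (long) Math.ceil(total * 1.0 / numSplits);

    List<List<Long>> groups = new ArrayList<List<Long>>();
    List<Long> current = new ArrayList<Long>();
    long currentSize = 0;

    for (long size : fileSizes) {
      // Close the current group once adding this file would exceed the target.
      if (currentSize + size > bytesPerSplit && !current.isEmpty()) {
        groups.add(current);
        current = new ArrayList<Long>();
        currentSize = 0;
      }
      current.add(size);
      currentSize += size;
    }
    if (!current.isEmpty()) groups.add(current);
    return groups;
  }

  public static void main(String[] args) {
    // Five files totalling 100 bytes, split two ways: roughly 50 bytes each.
    long[] sizes = {10, 40, 5, 25, 20};
    System.out.println(group(sizes, 2));   // prints [[10, 40], [5, 25, 20]]
  }
}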

+ 246 - 0
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputChunk.java

@@ -0,0 +1,246 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.tools.mapred.lib;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.tools.util.DistCpUtils;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskID;
+
+import java.io.IOException;
+
+/**
+ * The DynamicInputChunk represents a single chunk of work, when used in
+ * conjunction with the DynamicInputFormat and the DynamicRecordReader.
+ * The records in the DynamicInputFormat's input-file are split across various
+ * DynamicInputChunks. Each chunk is claimed and processed in one iteration of
+ * a dynamic-mapper. When a mapper exhausts its chunk, it may claim and process
+ * another, so faster mappers consume more chunks, until none remain.
+ */
+class DynamicInputChunk<K, V> {
+  private static Log LOG = LogFactory.getLog(DynamicInputChunk.class);
+
+  private static Configuration configuration;
+  private static Path chunkRootPath;
+  private static String chunkFilePrefix;
+  private static int numChunksLeft = -1; // Un-initialized before 1st dir-scan.
+  private static FileSystem fs;
+
+  private Path chunkFilePath;
+  private SequenceFileRecordReader<K, V> reader;
+  private SequenceFile.Writer writer;
+
+  private static void initializeChunkInvariants(Configuration config)
+                                                  throws IOException {
+    configuration = config;
+    Path listingFilePath = new Path(getListingFilePath(configuration));
+    chunkRootPath = new Path(listingFilePath.getParent(), "chunkDir");
+    fs = chunkRootPath.getFileSystem(configuration);
+    chunkFilePrefix = listingFilePath.getName() + ".chunk.";
+  }
+
+  private static String getListingFilePath(Configuration configuration) {
+    final String listingFileString = configuration.get(
+            DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
+    assert !listingFileString.equals("") : "Listing file not found.";
+    return listingFileString;
+  }
+
+  private static boolean areInvariantsInitialized() {
+    return chunkRootPath != null;
+  }
+
+  private DynamicInputChunk(String chunkId, Configuration configuration)
+                                                      throws IOException {
+    if (!areInvariantsInitialized())
+      initializeChunkInvariants(configuration);
+
+    chunkFilePath = new Path(chunkRootPath, chunkFilePrefix + chunkId);
+    openForWrite();
+  }
+
+
+  private void openForWrite() throws IOException {
+    writer = SequenceFile.createWriter(
+            chunkFilePath.getFileSystem(configuration), configuration,
+            chunkFilePath, Text.class, FileStatus.class,
+            SequenceFile.CompressionType.NONE);
+
+  }
+
+  /**
+   * Factory method to create chunk-files for writing to.
+   * (For instance, when the DynamicInputFormat splits the input-file into
+   * chunks.)
+   * @param chunkId String to identify the chunk.
+   * @param configuration Configuration, describing the location of the listing-
+   * file, file-system for the map-job, etc.
+   * @return A DynamicInputChunk, corresponding to a chunk-file, with the name
+   * incorporating the chunk-id.
+   * @throws IOException Exception on failure to create the chunk.
+   */
+  public static DynamicInputChunk createChunkForWrite(String chunkId,
+                          Configuration configuration) throws IOException {
+    return new DynamicInputChunk(chunkId, configuration);
+  }
+
+  /**
+   * Method to write records into a chunk.
+   * @param key Key from the listing file.
+   * @param value Corresponding value from the listing file.
+   * @throws IOException Exception on failure to write to the file.
+   */
+  public void write(Text key, FileStatus value) throws IOException {
+    writer.append(key, value);
+  }
+
+  /**
+   * Closes streams opened to the chunk-file.
+   */
+  public void close() {
+    IOUtils.cleanup(LOG, reader, writer);
+  }
+
+  /**
+   * Reassigns the chunk to a specified Map-Task, for consumption.
+   * @param taskId The Map-Task to which the chunk is to be reassigned.
+   * @throws IOException Exception on failure to reassign.
+   */
+  public void assignTo(TaskID taskId) throws IOException {
+    Path newPath = new Path(chunkRootPath, taskId.toString());
+    if (!fs.rename(chunkFilePath, newPath)) {
+      LOG.warn(chunkFilePath + " could not be assigned to " + taskId);
+    }
+  }
+
+  private DynamicInputChunk(Path chunkFilePath,
+                            TaskAttemptContext taskAttemptContext)
+                                   throws IOException, InterruptedException {
+    if (!areInvariantsInitialized())
+      initializeChunkInvariants(taskAttemptContext.getConfiguration());
+
+    this.chunkFilePath = chunkFilePath;
+    openForRead(taskAttemptContext);
+  }
+
+  private void openForRead(TaskAttemptContext taskAttemptContext)
+          throws IOException, InterruptedException {
+    reader = new SequenceFileRecordReader<K, V>();
+    reader.initialize(new FileSplit(chunkFilePath, 0,
+            DistCpUtils.getFileSize(chunkFilePath, configuration), null),
+            taskAttemptContext);
+  }
+
+  /**
+   * Factory method that
+   * 1. acquires a chunk for the specified map-task attempt
+   * 2. returns a DynamicInputChunk associated with the acquired chunk-file.
+   * @param taskAttemptContext The attempt-context for the map task that's
+   * trying to acquire a chunk.
+   * @return The acquired dynamic-chunk. The chunk-file is renamed to the
+   * task-id (from the attempt-context).
+   * @throws IOException Exception on failure.
+   * @throws InterruptedException Exception on failure.
+   */
+  public static DynamicInputChunk acquire(TaskAttemptContext taskAttemptContext)
+                                      throws IOException, InterruptedException {
+    if (!areInvariantsInitialized())
+        initializeChunkInvariants(taskAttemptContext.getConfiguration());
+
+    String taskId
+            = taskAttemptContext.getTaskAttemptID().getTaskID().toString();
+    Path acquiredFilePath = new Path(chunkRootPath, taskId);
+
+    if (fs.exists(acquiredFilePath)) {
+      LOG.info("Acquiring pre-assigned chunk: " + acquiredFilePath);
+      return new DynamicInputChunk(acquiredFilePath, taskAttemptContext);
+    }
+
+    for (FileStatus chunkFile : getListOfChunkFiles()) {
+      if (fs.rename(chunkFile.getPath(), acquiredFilePath)) {
+        LOG.info(taskId + " acquired " + chunkFile.getPath());
+        return new DynamicInputChunk(acquiredFilePath, taskAttemptContext);
+      }
+      else
+        LOG.warn(taskId + " could not acquire " + chunkFile.getPath());
+    }
+
+    return null;
+  }
+
+  /**
+   * Method to be called to relinquish an acquired chunk. All streams open to
+   * the chunk are closed, and the chunk-file is deleted.
+   * @throws IOException Exception thrown on failure to release (i.e. delete)
+   * the chunk file.
+   */
+  public void release() throws IOException {
+    close();
+    if (!fs.delete(chunkFilePath, false)) {
+      LOG.error("Unable to release chunk at path: " + chunkFilePath);
+      throw new IOException("Unable to release chunk at path: " + chunkFilePath);
+    }
+  }
+
+  static FileStatus [] getListOfChunkFiles() throws IOException {
+    Path chunkFilePattern = new Path(chunkRootPath, chunkFilePrefix + "*");
+    FileStatus chunkFiles[] = fs.globStatus(chunkFilePattern);
+    numChunksLeft = chunkFiles.length;
+    return chunkFiles;
+  }
+
+  /**
+   * Getter for the chunk-file's path, on HDFS.
+   * @return The qualified path to the chunk-file.
+   */
+  public Path getPath() {
+    return chunkFilePath;
+  }
+
+  /**
+   * Getter for the record-reader, opened to the chunk-file.
+   * @return Opened Sequence-file reader.
+   */
+  public SequenceFileRecordReader<K,V> getReader() {
+    assert reader != null : "Reader un-initialized!";
+    return reader;
+  }
+
+  /**
+   * Getter for the number of chunk-files left in the chunk-file directory.
+   * Useful to determine how many chunks (and hence, records) are left to be
+   * processed.
+   * @return Before the first scan of the directory, the number returned is -1.
+   * Otherwise, the number of chunk-files seen from the last scan is returned.
+   */
+  public static int getNumChunksLeft() {
+    return numChunksLeft;
+  }
+}
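
A chunk's life-cycle is: written by the input-format, claimed by a map-task through acquire(), drained through its record-reader, and finally deleted through release(). The consumer loop below is a hypothetical sketch of that life-cycle, not the actual record-reader from this patch; since DynamicInputChunk is package-private, such a consumer would have to live in the same package.

// Hypothetical consumer loop (not part of this patch): drain chunks until
// acquire() returns null, i.e. no unclaimed chunk files remain.
package org.apache.hadoop.tools.mapred.lib;

import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;

class ChunkConsumerSketch {

  static void drainAllChunks(TaskAttemptContext context)
      throws IOException, InterruptedException {
    DynamicInputChunk<Text, FileStatus> chunk = DynamicInputChunk.acquire(context);
    while (chunk != null) {
      SequenceFileRecordReader<Text, FileStatus> reader = chunk.getReader();
      while (reader.nextKeyValue()) {
        // Application logic would consume reader.getCurrentKey() and
        // reader.getCurrentValue() here.
      }
      chunk.release();   // closes the reader and deletes the chunk file
      chunk = DynamicInputChunk.acquire(context);
    }
  }
}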

Some files are not shown because too many files changed in this diff.