8 年之前 · 0cfa6ec6c8
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/filecache/DistributedCache.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/filecache/DistributedCache.java
@@ -32,79 +32,79 @@ import org.apache.hadoop.mapreduce.MRJobConfig;
 
				 
			
 
				 /**
			
 
				  * Distribute application-specific large, read-only files efficiently.
			
 
				- * 
			
 
				+ *
			
 
				  * <p><code>DistributedCache</code> is a facility provided by the Map-Reduce
			
 
				  * framework to cache files (text, archives, jars etc.) needed by applications.
			
 
				  * </p>
			
 
				- * 
			
 
				- * <p>Applications specify the files, via urls (hdfs:// or http://) to be cached 
			
 
				+ *
			
 
				+ * <p>Applications specify the files, via urls (hdfs:// or http://) to be cached
			
 
				  * via the {@link org.apache.hadoop.mapred.JobConf}. The
			
 
				  * <code>DistributedCache</code> assumes that the files specified via urls are
			
 
				  * already present on the {@link FileSystem} at the path specified by the url
			
 
				  * and are accessible by every machine in the cluster.</p>
			
 
				- * 
			
 
				- * <p>The framework will copy the necessary files on to the slave node before 
			
 
				- * any tasks for the job are executed on that node. Its efficiency stems from 
			
 
				- * the fact that the files are only copied once per job and the ability to 
			
 
				- * cache archives which are un-archived on the slaves.</p> 
			
 
				+ *
			
 
				+ * <p>The framework will copy the necessary files on to the worker node before
			
 
				+ * any tasks for the job are executed on that node. Its efficiency stems from
			
 
				+ * the fact that the files are only copied once per job and the ability to
			
 
				+ * cache archives which are un-archived on the workers.</p>
			
 
				  *
			
 
				  * <p><code>DistributedCache</code> can be used to distribute simple, read-only
			
 
				- * data/text files and/or more complex types such as archives, jars etc. 
			
 
				- * Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes. 
			
 
				- * Jars may be optionally added to the classpath of the tasks, a rudimentary 
			
 
				+ * data/text files and/or more complex types such as archives, jars etc.
			
 
				+ * Archives (zip, tar and tgz/tar.gz files) are un-archived at the worker nodes.
			
 
				+ * Jars may be optionally added to the classpath of the tasks, a rudimentary
			
 
				  * software distribution mechanism.  Files have execution permissions.
			
 
				  * In older version of Hadoop Map/Reduce users could optionally ask for symlinks
			
 
				- * to be created in the working directory of the child task.  In the current 
			
 
				- * version symlinks are always created.  If the URL does not have a fragment 
			
 
				- * the name of the file or directory will be used. If multiple files or 
			
 
				+ * to be created in the working directory of the child task.  In the current
			
 
				+ * version symlinks are always created.  If the URL does not have a fragment
			
 
				+ * the name of the file or directory will be used. If multiple files or
			
 
				  * directories map to the same link name, the last one added, will be used.  All
			
 
				  * others will not even be downloaded.</p>
			
 
				- * 
			
 
				- * <p><code>DistributedCache</code> tracks modification timestamps of the cache 
			
 
				- * files. Clearly the cache files should not be modified by the application 
			
 
				+ *
			
 
				+ * <p><code>DistributedCache</code> tracks modification timestamps of the cache
			
 
				+ * files. Clearly the cache files should not be modified by the application
			
 
				  * or externally while the job is executing.</p>
			
 
				- * 
			
 
				- * <p>Here is an illustrative example on how to use the 
			
 
				+ *
			
 
				+ * <p>Here is an illustrative example on how to use the
			
 
				  * <code>DistributedCache</code>:</p>
			
 
				  * <p><blockquote><pre>
			
 
				  *     // Setting up the cache for the application
			
 
				- *     
			
 
				+ *
			
 
				  *     1. Copy the requisite files to the <code>FileSystem</code>:
			
 
				- *     
			
 
				- *     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
			
 
				- *     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
			
 
				+ *
			
 
				+ *     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
			
 
				+ *     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
			
 
				  *     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
			
 
				  *     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
			
 
				  *     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
			
 
				  *     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
			
 
				- *     
			
 
				+ *
			
 
				  *     2. Setup the application's <code>JobConf</code>:
			
 
				- *     
			
 
				+ *
			
 
				  *     JobConf job = new JobConf();
			
 
				- *     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"), 
			
 
				+ *     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
			
 
				  *                                   job);
			
 
				- *     DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
			
 
				+ *     DistributedCache.addCacheArchive(new URI("/myapp/map.zip"), job);
			
 
				  *     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
			
 
				- *     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
			
 
				- *     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
			
 
				- *     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
			
 
				- *     
			
 
				+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar"), job);
			
 
				+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz"), job);
			
 
				+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz"), job);
			
 
				+ *
			
 
				  *     3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
			
 
				  *     or {@link org.apache.hadoop.mapred.Reducer}:
			
 
				- *     
			
 
				- *     public static class MapClass extends MapReduceBase  
			
 
				+ *
			
 
				+ *     public static class MapClass extends MapReduceBase
			
 
				  *     implements Mapper&lt;K, V, K, V&gt; {
			
 
				- *     
			
 
				+ *
			
 
				  *       private Path[] localArchives;
			
 
				  *       private Path[] localFiles;
			
 
				- *       
			
 
				+ *
			
 
				  *       public void configure(JobConf job) {
			
 
				  *         // Get the cached archives/files
			
 
				  *         File f = new File("./map.zip/some/file/in/zip.txt");
			
 
				  *       }
			
 
				- *       
			
 
				- *       public void map(K key, V value, 
			
 
				- *                       OutputCollector&lt;K, V&gt; output, Reporter reporter) 
			
 
				+ *
			
 
				+ *       public void map(K key, V value,
			
 
				+ *                       OutputCollector&lt;K, V&gt; output, Reporter reporter)
			
 
				  *       throws IOException {
			
 
				  *         // Use data from the cached archives/files here
			
 
				  *         // ...
			
@@ -112,7 +112,7 @@ import org.apache.hadoop.mapreduce.MRJobConfig;
 
				  *         output.collect(k, v);
			
 
				  *       }
			
 
				  *     }
			
 
				- *     
			
 
				+ *
			
 
				  * </pre></blockquote>
			
 
				  *
			
 
				  * It is also very common to use the DistributedCache by using
			
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/filecache/DistributedCache.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/filecache/DistributedCache.java
@@ -34,79 +34,79 @@ import java.net.URI;
 
				 
			
 
				 /**
			
 
				  * Distribute application-specific large, read-only files efficiently.
			
 
				- * 
			
 
				+ *
			
 
				  * <p><code>DistributedCache</code> is a facility provided by the Map-Reduce
			
 
				  * framework to cache files (text, archives, jars etc.) needed by applications.
			
 
				  * </p>
			
 
				- * 
			
 
				- * <p>Applications specify the files, via urls (hdfs:// or http://) to be cached 
			
 
				+ *
			
 
				+ * <p>Applications specify the files, via urls (hdfs:// or http://) to be cached
			
 
				  * via the {@link org.apache.hadoop.mapred.JobConf}. The
			
 
				  * <code>DistributedCache</code> assumes that the files specified via urls are
			
 
				  * already present on the {@link FileSystem} at the path specified by the url
			
 
				  * and are accessible by every machine in the cluster.</p>
			
 
				- * 
			
 
				- * <p>The framework will copy the necessary files on to the slave node before 
			
 
				- * any tasks for the job are executed on that node. Its efficiency stems from 
			
 
				- * the fact that the files are only copied once per job and the ability to 
			
 
				- * cache archives which are un-archived on the slaves.</p> 
			
 
				+ *
			
 
				+ * <p>The framework will copy the necessary files on to the worker node before
			
 
				+ * any tasks for the job are executed on that node. Its efficiency stems from
			
 
				+ * the fact that the files are only copied once per job and the ability to
			
 
				+ * cache archives which are un-archived on the workers.</p>
			
 
				  *
			
 
				  * <p><code>DistributedCache</code> can be used to distribute simple, read-only
			
 
				- * data/text files and/or more complex types such as archives, jars etc. 
			
 
				- * Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes. 
			
 
				- * Jars may be optionally added to the classpath of the tasks, a rudimentary 
			
 
				+ * data/text files and/or more complex types such as archives, jars etc.
			
 
				+ * Archives (zip, tar and tgz/tar.gz files) are un-archived at the worker nodes.
			
 
				+ * Jars may be optionally added to the classpath of the tasks, a rudimentary
			
 
				  * software distribution mechanism.  Files have execution permissions.
			
 
				  * In older version of Hadoop Map/Reduce users could optionally ask for symlinks
			
 
				- * to be created in the working directory of the child task.  In the current 
			
 
				- * version symlinks are always created.  If the URL does not have a fragment 
			
 
				- * the name of the file or directory will be used. If multiple files or 
			
 
				+ * to be created in the working directory of the child task.  In the current
			
 
				+ * version symlinks are always created.  If the URL does not have a fragment
			
 
				+ * the name of the file or directory will be used. If multiple files or
			
 
				  * directories map to the same link name, the last one added, will be used.  All
			
 
				  * others will not even be downloaded.</p>
			
 
				- * 
			
 
				- * <p><code>DistributedCache</code> tracks modification timestamps of the cache 
			
 
				- * files. Clearly the cache files should not be modified by the application 
			
 
				+ *
			
 
				+ * <p><code>DistributedCache</code> tracks modification timestamps of the cache
			
 
				+ * files. Clearly the cache files should not be modified by the application
			
 
				  * or externally while the job is executing.</p>
			
 
				- * 
			
 
				- * <p>Here is an illustrative example on how to use the 
			
 
				+ *
			
 
				+ * <p>Here is an illustrative example on how to use the
			
 
				  * <code>DistributedCache</code>:</p>
			
 
				  * <p><blockquote><pre>
			
 
				  *     // Setting up the cache for the application
			
 
				- *     
			
 
				+ *
			
 
				  *     1. Copy the requisite files to the <code>FileSystem</code>:
			
 
				- *     
			
 
				- *     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
			
 
				- *     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
			
 
				+ *
			
 
				+ *     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
			
 
				+ *     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
			
 
				  *     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
			
 
				  *     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
			
 
				  *     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
			
 
				  *     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
			
 
				- *     
			
 
				+ *
			
 
				  *     2. Setup the application's <code>JobConf</code>:
			
 
				- *     
			
 
				+ *
			
 
				  *     JobConf job = new JobConf();
			
 
				- *     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"), 
			
 
				+ *     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
			
 
				  *                                   job);
			
 
				- *     DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
			
 
				+ *     DistributedCache.addCacheArchive(new URI("/myapp/map.zip"), job);
			
 
				  *     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
			
 
				- *     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
			
 
				- *     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
			
 
				- *     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
			
 
				- *     
			
 
				+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar"), job);
			
 
				+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz"), job);
			
 
				+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz"), job);
			
 
				+ *
			
 
				  *     3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
			
 
				  *     or {@link org.apache.hadoop.mapred.Reducer}:
			
 
				- *     
			
 
				- *     public static class MapClass extends MapReduceBase  
			
 
				+ *
			
 
				+ *     public static class MapClass extends MapReduceBase
			
 
				  *     implements Mapper&lt;K, V, K, V&gt; {
			
 
				- *     
			
 
				+ *
			
 
				  *       private Path[] localArchives;
			
 
				  *       private Path[] localFiles;
			
 
				- *       
			
 
				+ *
			
 
				  *       public void configure(JobConf job) {
			
 
				  *         // Get the cached archives/files
			
 
				  *         File f = new File("./map.zip/some/file/in/zip.txt");
			
 
				  *       }
			
 
				- *       
			
 
				- *       public void map(K key, V value, 
			
 
				- *                       OutputCollector&lt;K, V&gt; output, Reporter reporter) 
			
 
				+ *
			
 
				+ *       public void map(K key, V value,
			
 
				+ *                       OutputCollector&lt;K, V&gt; output, Reporter reporter)
			
 
				  *       throws IOException {
			
 
				  *         // Use data from the cached archives/files here
			
 
				  *         // ...
			
@@ -114,7 +114,7 @@ import java.net.URI;
 
				  *         output.collect(k, v);
			
 
				  *       }
			
 
				  *     }
			
 
				- *     
			
 
				+ *
			
 
				  * </pre></blockquote>
			
 
				  *
			
 
				  * It is also very common to use the DistributedCache by using
			
@@ -281,7 +281,7 @@ public class DistributedCache {
 
				     conf.set(MRJobConfig.CACHE_ARCHIVES, archives == null ? uri.toString()
			
 
				              : archives + "," + uri.toString());
			
 
				   }
			
 
				-  
			
 
				+
			
 
				   /**
			
 
				    * Add a file to be localized to the conf.  The localized file will be
			
 
				    * downloaded to the execution node(s), and a link will created to the
			
@@ -370,7 +370,7 @@ public class DistributedCache {
 
				   /**
			
 
				    * Get the file entries in classpath as an array of Path.
			
 
				    * Used by internal DistributedCache code.
			
 
				-   * 
			
 
				+   *
			
 
				    * @param conf Configuration that contains the classpath setting
			
 
				    * @deprecated Use {@link JobContext#getFileClassPaths()} instead
			
 
				    * @see JobContext#getFileClassPaths()
			
@@ -379,8 +379,8 @@ public class DistributedCache {
 
				   public static Path[] getFileClassPaths(Configuration conf) {
			
 
				     ArrayList<String> list = (ArrayList<String>)conf.getStringCollection(
			
 
				                                 MRJobConfig.CLASSPATH_FILES);
			
 
				-    if (list.size() == 0) { 
			
 
				-      return null; 
			
 
				+    if (list.size() == 0) {
			
 
				+      return null;
			
 
				     }
			
 
				     Path[] paths = new Path[list.size()];
			
 
				     for (int i = 0; i < list.size(); i++) {
			
@@ -392,7 +392,7 @@ public class DistributedCache {
 
				   /**
			
 
				    * Add an archive path to the current set of classpath entries. It adds the
			
 
				    * archive to cache as well.  Intended to be used by user code.
			
 
				-   * 
			
 
				+   *
			
 
				    * @param archive Path of the archive to be added
			
 
				    * @param conf Configuration that contains the classpath setting
			
 
				    * @deprecated Use {@link Job#addArchiveToClassPath(Path)} instead
			
@@ -426,7 +426,7 @@ public class DistributedCache {
 
				   /**
			
 
				    * Get the archive entries in classpath as an array of Path.
			
 
				    * Used by internal DistributedCache code.
			
 
				-   * 
			
 
				+   *
			
 
				    * @param conf Configuration that contains the classpath setting
			
 
				    * @deprecated Use {@link JobContext#getArchiveClassPaths()} instead 
			
 
				    * @see JobContext#getArchiveClassPaths()
			
@@ -435,8 +435,8 @@ public class DistributedCache {
 
				   public static Path[] getArchiveClassPaths(Configuration conf) {
			
 
				     ArrayList<String> list = (ArrayList<String>)conf.getStringCollection(
			
 
				                                 MRJobConfig.CLASSPATH_ARCHIVES);
			
 
				-    if (list.size() == 0) { 
			
 
				-      return null; 
			
 
				+    if (list.size() == 0) {
			
 
				+      return null;
			
 
				     }
			
 
				     Path[] paths = new Path[list.size()];
			
 
				     for (int i = 0; i < list.size(); i++) {
			
@@ -449,13 +449,13 @@ public class DistributedCache {
 
				    * Originally intended to enable symlinks, but currently symlinks cannot be
			
 
				    * disabled. This is a NO-OP.
			
 
				    * @param conf the jobconf
			
 
				-   * @deprecated This is a NO-OP. 
			
 
				+   * @deprecated This is a NO-OP.
			
 
				    */
			
 
				   @Deprecated
			
 
				   public static void createSymlink(Configuration conf){
			
 
				     //NOOP
			
 
				   }
			
 
				-  
			
 
				+
			
 
				   /**
			
 
				    * Originally intended to check if symlinks should be used, but currently
			
 
				    * symlinks cannot be disabled.
			
@@ -480,29 +480,29 @@ public class DistributedCache {
 
				   }
			
 
				 
			
 
				   /**
			
 
				-   * Get the booleans on whether the files are public or not.  Used by 
			
 
				+   * Get the booleans on whether the files are public or not.  Used by
			
 
				    * internal DistributedCache and MapReduce code.
			
 
				    * @param conf The configuration which stored the timestamps
			
 
				-   * @return a string array of booleans 
			
 
				+   * @return a string array of booleans
			
 
				    */
			
 
				   public static boolean[] getFileVisibilities(Configuration conf) {
			
 
				     return parseBooleans(conf.getStrings(MRJobConfig.CACHE_FILE_VISIBILITIES));
			
 
				   }
			
 
				 
			
 
				   /**
			
 
				-   * Get the booleans on whether the archives are public or not.  Used by 
			
 
				+   * Get the booleans on whether the archives are public or not.  Used by
			
 
				    * internal DistributedCache and MapReduce code.
			
 
				    * @param conf The configuration which stored the timestamps
			
 
				-   * @return a string array of booleans 
			
 
				+   * @return a string array of booleans
			
 
				    */
			
 
				   public static boolean[] getArchiveVisibilities(Configuration conf) {
			
 
				     return parseBooleans(conf.getStrings(MRJobConfig.CACHE_ARCHIVES_VISIBILITIES));
			
 
				   }
			
 
				 
			
 
				   /**
			
 
				-   * This method checks if there is a conflict in the fragment names 
			
 
				-   * of the uris. Also makes sure that each uri has a fragment. It 
			
 
				-   * is only to be called if you want to create symlinks for 
			
 
				+   * This method checks if there is a conflict in the fragment names
			
 
				+   * of the uris. Also makes sure that each uri has a fragment. It
			
 
				+   * is only to be called if you want to create symlinks for
			
 
				    * the various archives and files.  May be used by user code.
			
 
				    * @param uriFiles The uri array of urifiles
			
 
				    * @param uriArchives the uri array of uri archives
			
@@ -514,7 +514,7 @@ public class DistributedCache {
 
				     // check if fragment is null for any uri
			
 
				     // also check if there are any conflicts in fragment names
			
 
				     Set<String> fragments = new HashSet<String>();
			
 
				-    
			
 
				+
			
 
				     // iterate over file uris
			
 
				     if (uriFiles != null) {
			
 
				       for (int i = 0; i < uriFiles.length; i++) {
			
@@ -529,7 +529,7 @@ public class DistributedCache {
 
				         fragments.add(lowerCaseFragment);
			
 
				       }
			
 
				     }
			
 
				-    
			
 
				+
			
 
				     // iterate over archive uris
			
 
				     if (uriArchives != null) {
			
 
				       for (int i = 0; i < uriArchives.length; i++) {