
HADOOP-2019. Adds support for .tar, .tgz and .tar.gz files in DistributedCache. Contributed by Amareshwari Sriramadasu.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/trunk@654080 13f79535-47bb-0310-9956-ffa450edef68
Devaraj Das 17 years ago
parent
commit
367d0f0025

+ 3 - 0
CHANGES.txt

@@ -59,6 +59,9 @@ Trunk (unreleased changes)
     "hdfs:" URIs now defaults to 8020, so that one may simply use URIs
     "hdfs:" URIs now defaults to 8020, so that one may simply use URIs
     of the form "hdfs://example.com/dir/file".
     of the form "hdfs://example.com/dir/file".
 
 
+    HADOOP-2019. Adds support for .tar, .tgz and .tar.gz files in 
+    DistributedCache (Amareshwari Sriramadasu via ddas)
+
   IMPROVEMENTS
 
     HADOOP-2928. Remove deprecated FileSystem.getContentLength().

+ 3 - 0
build.xml

@@ -528,6 +528,9 @@
     <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.txt" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.txt" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.jar" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.jar" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.zip" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.zip" todir="${test.cache.data}"/>
+    <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.tar" todir="${test.cache.data}"/>
+    <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.tgz" todir="${test.cache.data}"/>
+    <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.tar.gz" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/dfs/hadoop-14-dfs-dir.tgz" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/dfs/hadoop-14-dfs-dir.tgz" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/dfs/hadoop-dfs-dir.txt" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/dfs/hadoop-dfs-dir.txt" todir="${test.cache.data}"/>
   </target>
   </target>

+ 11 - 2
docs/changes.html

@@ -91,7 +91,7 @@ and history UI.<br />(Amareshwari Sriramadasu via ddas)</li>
     </ol>
   </li>
   <li><a href="javascript:toggleList('trunk_(unreleased_changes)_._new_features_')">  NEW FEATURES
-</a>&nbsp;&nbsp;&nbsp;(4)
+</a>&nbsp;&nbsp;&nbsp;(6)
     <ol id="trunk_(unreleased_changes)_._new_features_">
     <ol id="trunk_(unreleased_changes)_._new_features_">
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-3074">HADOOP-3074</a>. Provides a UrlStreamHandler for DFS and other FS,
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-3074">HADOOP-3074</a>. Provides a UrlStreamHandler for DFS and other FS,
 relying on FileSystem<br />(taton)</li>
 relying on FileSystem<br />(taton)</li>
@@ -101,10 +101,16 @@ accessible via a NFS mount.<br />(shv)</li>
 Bialecki via omalley)</li>
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-2857">HADOOP-2857</a>. Allow libhdfs to set jvm options.<br />(Craig Macdonald
 via omalley)</li>
+      <li><a href="http://issues.apache.org/jira/browse/HADOOP-3317">HADOOP-3317</a>. Add default port for HDFS namenode.  The port in
+"hdfs:" URIs now defaults to 8020, so that one may simply use URIs
+of the form "hdfs://example.com/dir/file".
+</li>
+      <li><a href="http://issues.apache.org/jira/browse/HADOOP-2019">HADOOP-2019</a>. Adds support for .tar, .tgz and .tar.gz files in
+DistributedCache<br />(Amareshwari Sriramadasu via ddas)</li>
     </ol>
   </li>
   <li><a href="javascript:toggleList('trunk_(unreleased_changes)_._improvements_')">  IMPROVEMENTS
-</a>&nbsp;&nbsp;&nbsp;(10)
+</a>&nbsp;&nbsp;&nbsp;(11)
     <ol id="trunk_(unreleased_changes)_._improvements_">
     <ol id="trunk_(unreleased_changes)_._improvements_">
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-2928">HADOOP-2928</a>. Remove deprecated FileSystem.getContentLength().<br />(Lohit Vjayarenu via rangadi)</li>
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-2928">HADOOP-2928</a>. Remove deprecated FileSystem.getContentLength().<br />(Lohit Vjayarenu via rangadi)</li>
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-3130">HADOOP-3130</a>. Make the connect timeout smaller for getFile.<br />(Amar Ramesh Kamat via ddas)</li>
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-3130">HADOOP-3130</a>. Make the connect timeout smaller for getFile.<br />(Amar Ramesh Kamat via ddas)</li>
@@ -129,6 +135,9 @@ fix minor defects, and add eclipse plugin and python unit tests.<br />(nigel)</l
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-3144">HADOOP-3144</a>. Improve robustness of LineRecordReader by defining a maximum
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-3144">HADOOP-3144</a>. Improve robustness of LineRecordReader by defining a maximum
 line length (mapred.linerecordreader.maxlength), thereby avoiding reading
 line length (mapred.linerecordreader.maxlength), thereby avoiding reading
 too far into the following split.<br />(Zheng Shao via cdouglas)</li>
 too far into the following split.<br />(Zheng Shao via cdouglas)</li>
+      <li><a href="http://issues.apache.org/jira/browse/HADOOP-3334">HADOOP-3334</a>. Move lease handling from FSNamesystem into a seperate class.
+(Tsz Wo (Nicholas), SZE via rangadi)
+</li>
     </ol>
   </li>
   <li><a href="javascript:toggleList('trunk_(unreleased_changes)_._optimizations_')">  OPTIMIZATIONS

+ 2 - 1
docs/mapred_tutorial.html

@@ -1872,7 +1872,8 @@ document.write("Last Published: " + document.lastModified);
 <p>
 <span class="codefrag">DistributedCache</span> can be used to distribute simple, 
           read-only data/text files and more complex types such as archives and
-          jars. Archives (zip files) are <em>un-archived</em> at the slave nodes.
+          jars. Archives (zip, tar, tgz and tar.gz files) are 
+          <em>un-archived</em> at the slave nodes.
           Optionally users can also direct the <span class="codefrag">DistributedCache</span> to 
           <em>symlink</em> the cached file(s) into the <span class="codefrag">current working 
           directory</span> of the task via the 

File diffs are limited because too many files have changed
+ 1 - 1
docs/mapred_tutorial.pdf


+ 2 - 1
src/docs/src/documentation/content/xdocs/mapred_tutorial.xml

@@ -1416,7 +1416,8 @@
 
 
           <p><code>DistributedCache</code> can be used to distribute simple, 
           read-only data/text files and more complex types such as archives and
-          jars. Archives (zip files) are <em>un-archived</em> at the slave nodes.
+          jars. Archives (zip, tar, tgz and tar.gz files) are 
+          <em>un-archived</em> at the slave nodes.
           Optionally users can also direct the <code>DistributedCache</code> to 
           <em>symlink</em> the cached file(s) into the <code>current working 
           directory</code> of the task via the 
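To make the tutorial text above concrete, here is a minimal job-setup sketch in the spirit of this patch, using the public DistributedCache API. The HDFS path and class name are illustrative, not part of this change:

import java.net.URI;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.mapred.JobConf;

public class CacheSetup {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(CacheSetup.class);
    // Hypothetical archive, assumed already copied to HDFS. With this
    // patch, .tar, .tgz and .tar.gz archives are un-archived at the
    // slave nodes just like .zip and .jar before it.
    DistributedCache.addCacheArchive(new URI("/myapp/data.tar.gz"), job);
  }
}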

+ 33 - 17
src/java/org/apache/hadoop/filecache/DistributedCache.java

@@ -50,11 +50,11 @@ import java.net.URI;
  *
  * <p><code>DistributedCache</code> can be used to distribute simple, read-only
  * data/text files and/or more complex types such as archives, jars etc. 
- * Archives (zip files) are un-archived at the slave nodes. Jars maybe be 
- * optionally added to the classpath of the tasks, a rudimentary software
- * distribution mechanism.  Files have execution permissions. Optionally users 
- * can also direct it to symlink the distributed cache file(s) into 
- * the working directory of the task.</p>
+ * Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes. 
+ * Jars may be optionally added to the classpath of the tasks, a rudimentary 
+ * software distribution mechanism.  Files have execution permissions.
+ * Optionally users can also direct it to symlink the distributed cache file(s)
+ * into the working directory of the task.</p>
  * 
  * <p><code>DistributedCache</code> tracks modification timestamps of the cache 
  * files. Clearly the cache files should not be modified by the application 
@@ -70,6 +70,9 @@ import java.net.URI;
  *     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
  *     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
  *     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ *     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ *     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ *     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
  *     
  *     2. Setup the application's <code>JobConf</code>:
  *     
@@ -78,7 +81,10 @@ import java.net.URI;
  *                                   job);
  *     DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
  *     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
- *
+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+ *     
  *     3. Use the cached files in the {@link Mapper} or {@link Reducer}:
  *     
  *     public static class MapClass extends MapReduceBase  
@@ -129,9 +135,11 @@ public class DistributedCache {
    * @param conf The Confguration file which contains the filesystem
    * @param baseDir The base cache Dir where you wnat to localize the files/archives
    * @param fileStatus The file status on the dfs.
-   * @param isArchive if the cache is an archive or a file. In case it is an archive
-   *  with a .zip or .jar extension it will be unzipped/unjarred automatically 
-   *  and the directory where the archive is unjarred is returned as the Path.
+   * @param isArchive if the cache is an archive or a file. In case it is an
+   *  archive with a .zip or .jar or .tar or .tgz or .tar.gz extension it will
+   *  be unzipped/unjarred/untarred automatically 
+   *  and the directory where the archive is unzipped/unjarred/untarred is
+   *  returned as the Path.
    *  In case of a file, the path to the file is returned
    * @param confFileStamp this is the hdfs file modification timestamp to verify that the 
    * file to be cached hasn't changed since the job started
@@ -185,9 +193,11 @@ public class DistributedCache {
    * being used in the Configuration
    * @param conf The Confguration file which contains the filesystem
    * @param baseDir The base cache Dir where you wnat to localize the files/archives
-   * @param isArchive if the cache is an archive or a file. In case it is an archive
-   *  with a .zip or .jar extension it will be unzipped/unjarred automatically 
-   *  and the directory where the archive is unjarred is returned as the Path.
+   * @param isArchive if the cache is an archive or a file. In case it is an 
+   *  archive with a .zip or .jar or .tar or .tgz or .tar.gz extension it will 
+   *  be unzipped/unjarred/untarred automatically 
+   *  and the directory where the archive is unzipped/unjarred/untarred 
+   *  is returned as the Path.
    *  In case of a file, the path to the file is returned
    * @param confFileStamp this is the hdfs file modification timestamp to verify that the 
    * file to be cached hasn't changed since the job started
@@ -331,13 +341,14 @@ public class DistributedCache {
       fs.copyToLocalFile(new Path(cacheId), parchive);
       if (isArchive) {
         String tmpArchive = parchive.toString().toLowerCase();
+        File srcFile = new File(parchive.toString());
+        File destDir = new File(parchive.getParent().toString());
         if (tmpArchive.endsWith(".jar")) {
         if (tmpArchive.endsWith(".jar")) {
-          RunJar.unJar(new File(parchive.toString()), new File(parchive
-                                                               .getParent().toString()));
+          RunJar.unJar(srcFile, destDir);
         } else if (tmpArchive.endsWith(".zip")) {
         } else if (tmpArchive.endsWith(".zip")) {
-          FileUtil.unZip(new File(parchive.toString()), new File(parchive
-                                                                 .getParent().toString()));
-
+          FileUtil.unZip(srcFile, destDir);
+        } else if (isTarFile(tmpArchive)) {
+          FileUtil.unTar(srcFile, destDir);
         }
         // else will not do anyhting
         // and copy the file into the dir as it is
@@ -373,6 +384,11 @@ public class DistributedCache {
     }
   }
 
+  private static boolean isTarFile(String filename) {
+    return (filename.endsWith(".tgz") || filename.endsWith(".tar.gz") ||
+           filename.endsWith(".tar"));
+  }
+  
   // Checks if the cache has already been localized and is fresh
   private static boolean ifExistsAndFresh(Configuration conf, FileSystem fs, 
                                           URI cache, long confFileStamp, 
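The javadoc above covers registering archives (step 2); here is a hedged sketch of the consuming side (step 3), using the existing getLocalCacheArchives() accessor. The class itself is illustrative, not from this patch:

import java.io.IOException;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class CacheAwareMapper extends MapReduceBase {
  private Path[] localArchives;

  public void configure(JobConf job) {
    try {
      // Each entry is the local directory where the corresponding archive
      // (.zip, .jar, and with this patch .tar/.tgz/.tar.gz) was unpacked.
      localArchives = DistributedCache.getLocalCacheArchives(job);
    } catch (IOException e) {
      throw new RuntimeException("Could not read local cache archives", e);
    }
  }
}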

+ 46 - 1
src/java/org/apache/hadoop/fs/FileUtil.java

@@ -26,6 +26,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.util.Shell.ShellCommandExecutor;
 
 
 /**
  * A collection of file-processing util methods
@@ -453,7 +454,51 @@ public class FileUtil {
       zipFile.close();
     }
   }
-  
+
+  /**
+   * Given a Tar File as input it will untar the file in a the untar directory
+   * passed as the second parameter
+   * 
+   * This utility will untar ".tar" files and ".tar.gz","tgz" files.
+   *  
+   * @param inFile The tar file as input. 
+   * @param untarDir The untar directory where to untar the tar file.
+   * @throws IOException
+   */
+  public static void unTar(File inFile, File untarDir) throws IOException {
+    if (!untarDir.mkdirs()) {           
+      if (!untarDir.isDirectory()) {
+        throw new IOException("Mkdirs failed to create " + untarDir);
+      }
+    }
+
+    StringBuffer untarCommand = new StringBuffer();
+    boolean gzipped = inFile.toString().endsWith("gz");
+    if (gzipped) {
+      untarCommand.append(" gzip -dc '");
+      untarCommand.append(FileUtil.makeShellPath(inFile));
+      untarCommand.append("' | (");
+    } 
+    untarCommand.append("cd '");
+    untarCommand.append(FileUtil.makeShellPath(untarDir)); 
+    untarCommand.append("' ; ");
+    untarCommand.append("tar -xf ");
+    
+    if (gzipped) {
+      untarCommand.append(" -)");
+    } else {
+      untarCommand.append(FileUtil.makeShellPath(inFile));
+    }
+    String[] shellCmd = { "bash", "-c", untarCommand.toString() };
+    ShellCommandExecutor shexec = new ShellCommandExecutor(shellCmd);
+    shexec.execute();
+    int exitcode = shexec.getExitCode();
+    if (exitcode != 0) {
+      throw new IOException("Error untarring file " + inFile + 
+                  ". Tar process exited with exit code " + exitcode);
+    }
+  }
+
   /**
    * Class for creating hardlinks.
    * Supports Unix, Cygwin, WindXP.
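For reference, the new helper can be exercised directly; a minimal sketch with illustrative paths:

import java.io.File;
import org.apache.hadoop.fs.FileUtil;

public class UnTarExample {
  public static void main(String[] args) throws Exception {
    // For gzipped input, unTar runs roughly
    //   bash -c "gzip -dc '<file>' | (cd '<dir>' ; tar -xf -)"
    // so bash, gzip and tar must be available on the node.
    FileUtil.unTar(new File("/tmp/sample.tar.gz"), new File("/tmp/untarred"));
  }
}

Shelling out to the platform's tar keeps the Java side small, at the cost of requiring Unix tools on every node; TestDFSUpgradeFromImage below is updated to reuse this helper instead of building the same pipeline inline.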

+ 1 - 7
src/test/org/apache/hadoop/dfs/TestDFSUpgradeFromImage.java

@@ -27,7 +27,6 @@ import java.util.TreeMap;
 import java.util.zip.CRC32;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.fs.FSInputStream;
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.dfs.FSConstants.StartupOption;
@@ -72,12 +71,7 @@ public class TestDFSUpgradeFromImage extends TestCase {
     if ( dfsDir.exists() && !FileUtil.fullyDelete(dfsDir) ) {
       throw new IOException("Could not delete dfs directory '" + dfsDir + "'");
     }
-    String cmd = "gzip -dc '" + FileUtil.makeShellPath(tarFile) + "' | (cd '" +
-                 FileUtil.makeShellPath(dataDir) + "' ; tar -xf -)";
-    LOG.info("Unpacking the tar file. Cmd : " + cmd);
-    String[] shellCmd = { "bash", "-c", cmd };
-    Shell.execCommand(shellCmd);
-    
+    FileUtil.unTar(new File(tarFile), new File(dataDir));
     //Now read the reference info
     
     BufferedReader reader = new BufferedReader( 

+ 14 - 1
src/test/org/apache/hadoop/mapred/MRCaching.java

@@ -175,6 +175,9 @@ public class MRCaching {
     Path txtPath = new Path(localPath, new Path("test.txt"));
     Path jarPath = new Path(localPath, new Path("test.jar"));
     Path zipPath = new Path(localPath, new Path("test.zip"));
+    Path tarPath = new Path(localPath, new Path("test.tgz"));
+    Path tarPath1 = new Path(localPath, new Path("test.tar.gz"));
+    Path tarPath2 = new Path(localPath, new Path("test.tar"));
     Path cachePath = new Path(cacheDir);
     fs.delete(cachePath, true);
     if (!fs.mkdirs(cachePath)) {
@@ -183,13 +186,23 @@ public class MRCaching {
     fs.copyFromLocalFile(txtPath, cachePath);
     fs.copyFromLocalFile(jarPath, cachePath);
     fs.copyFromLocalFile(zipPath, cachePath);
+    fs.copyFromLocalFile(tarPath, cachePath);
+    fs.copyFromLocalFile(tarPath1, cachePath);
+    fs.copyFromLocalFile(tarPath2, cachePath);
     // setting the cached archives to zip, jar and simple text files
     URI uri1 = fs.getUri().resolve(cachePath + "/test.jar");
     URI uri2 = fs.getUri().resolve(cachePath + "/test.zip");
     URI uri3 = fs.getUri().resolve(cachePath + "/test.txt");
+    URI uri4 = fs.getUri().resolve(cachePath + "/test.tgz");
+    URI uri5 = fs.getUri().resolve(cachePath + "/test.tar.gz");
+    URI uri6 = fs.getUri().resolve(cachePath + "/test.tar");
+
     DistributedCache.addCacheArchive(uri1, conf);
     DistributedCache.addCacheArchive(uri2, conf);
     DistributedCache.addCacheFile(uri3, conf);
+    DistributedCache.addCacheArchive(uri4, conf);
+    DistributedCache.addCacheArchive(uri5, conf);
+    DistributedCache.addCacheArchive(uri6, conf);
     RunningJob job = JobClient.runJob(conf);
     int count = 0;
     // after the job ran check to see if the the input from the localized cache
@@ -208,7 +221,7 @@ public class MRCaching {
       }
       file.close();
     }
-    if (count != 3)
+    if (count != 6)
       return new TestResult(job, false);
 
     return new TestResult(job, true);

BIN
src/test/org/apache/hadoop/mapred/test.tar


BIN
src/test/org/apache/hadoop/mapred/test.tar.gz


BIN
src/test/org/apache/hadoop/mapred/test.tgz


Some files were not shown because too many files changed in this diff