
HADOOP-2019. Adds support for .tar, .tgz and .tar.gz files in DistributedCache. Contributed by Amareshwari Sriramadasu.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/trunk@654080 13f79535-47bb-0310-9956-ffa450edef68
Devaraj Das 17 years ago
parent
commit
367d0f0025

+ 3 - 0
CHANGES.txt

@@ -59,6 +59,9 @@ Trunk (unreleased changes)
     "hdfs:" URIs now defaults to 8020, so that one may simply use URIs
     of the form "hdfs://example.com/dir/file".
 
+    HADOOP-2019. Adds support for .tar, .tgz and .tar.gz files in 
+    DistributedCache (Amareshwari Sriramadasu via ddas)
+
   IMPROVEMENTS
    
     HADOOP-2928. Remove deprecated FileSystem.getContentLength().

+ 3 - 0
build.xml

@@ -528,6 +528,9 @@
     <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.txt" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.jar" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.zip" todir="${test.cache.data}"/>
+    <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.tar" todir="${test.cache.data}"/>
+    <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.tgz" todir="${test.cache.data}"/>
+    <copy file="${test.src.dir}/org/apache/hadoop/mapred/test.tar.gz" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/dfs/hadoop-14-dfs-dir.tgz" todir="${test.cache.data}"/>
     <copy file="${test.src.dir}/org/apache/hadoop/dfs/hadoop-dfs-dir.txt" todir="${test.cache.data}"/>
   </target>

+ 11 - 2
docs/changes.html

@@ -91,7 +91,7 @@ and history UI.<br />(Amareshwari Sriramadasu via ddas)</li>
     </ol>
   </li>
   <li><a href="javascript:toggleList('trunk_(unreleased_changes)_._new_features_')">  NEW FEATURES
-</a>&nbsp;&nbsp;&nbsp;(4)
+</a>&nbsp;&nbsp;&nbsp;(6)
     <ol id="trunk_(unreleased_changes)_._new_features_">
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-3074">HADOOP-3074</a>. Provides a UrlStreamHandler for DFS and other FS,
 relying on FileSystem<br />(taton)</li>
@@ -101,10 +101,16 @@ accessible via a NFS mount.<br />(shv)</li>
 Bialecki via omalley)</li>
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-2857">HADOOP-2857</a>. Allow libhdfs to set jvm options.<br />(Craig Macdonald
 via omalley)</li>
+      <li><a href="http://issues.apache.org/jira/browse/HADOOP-3317">HADOOP-3317</a>. Add default port for HDFS namenode.  The port in
+"hdfs:" URIs now defaults to 8020, so that one may simply use URIs
+of the form "hdfs://example.com/dir/file".
+</li>
+      <li><a href="http://issues.apache.org/jira/browse/HADOOP-2019">HADOOP-2019</a>. Adds support for .tar, .tgz and .tar.gz files in
+DistributedCache<br />(Amareshwari Sriramadasu via ddas)</li>
     </ol>
   </li>
   <li><a href="javascript:toggleList('trunk_(unreleased_changes)_._improvements_')">  IMPROVEMENTS
-</a>&nbsp;&nbsp;&nbsp;(10)
+</a>&nbsp;&nbsp;&nbsp;(11)
     <ol id="trunk_(unreleased_changes)_._improvements_">
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-2928">HADOOP-2928</a>. Remove deprecated FileSystem.getContentLength().<br />(Lohit Vjayarenu via rangadi)</li>
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-3130">HADOOP-3130</a>. Make the connect timeout smaller for getFile.<br />(Amar Ramesh Kamat via ddas)</li>
@@ -129,6 +135,9 @@ fix minor defects, and add eclipse plugin and python unit tests.<br />(nigel)</l
       <li><a href="http://issues.apache.org/jira/browse/HADOOP-3144">HADOOP-3144</a>. Improve robustness of LineRecordReader by defining a maximum
 line length (mapred.linerecordreader.maxlength), thereby avoiding reading
 too far into the following split.<br />(Zheng Shao via cdouglas)</li>
+      <li><a href="http://issues.apache.org/jira/browse/HADOOP-3334">HADOOP-3334</a>. Move lease handling from FSNamesystem into a separate class.
+(Tsz Wo (Nicholas), SZE via rangadi)
+</li>
     </ol>
   </li>
   <li><a href="javascript:toggleList('trunk_(unreleased_changes)_._optimizations_')">  OPTIMIZATIONS

+ 2 - 1
docs/mapred_tutorial.html

@@ -1872,7 +1872,8 @@ document.write("Last Published: " + document.lastModified);
 <p>
 <span class="codefrag">DistributedCache</span> can be used to distribute simple, 
           read-only data/text files and more complex types such as archives and
-          jars. Archives (zip files) are <em>un-archived</em> at the slave nodes.
+          jars. Archives (zip, tar, tgz and tar.gz files) are 
+          <em>un-archived</em> at the slave nodes.
           Optionally users can also direct the <span class="codefrag">DistributedCache</span> to 
           <em>symlink</em> the cached file(s) into the <span class="codefrag">current working 
           directory</span> of the task via the 
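
For context, a minimal sketch of the client-side setup this paragraph describes, using the JobConf-era API; the archive path and driver class below are illustrative assumptions, not part of this patch:

    import java.net.URI;
    import java.net.URISyntaxException;
    import org.apache.hadoop.filecache.DistributedCache;
    import org.apache.hadoop.mapred.JobConf;

    public class CacheSetupExample {  // hypothetical driver class
      public static void configureCache(JobConf job) throws URISyntaxException {
        // The archive is un-archived at the slave nodes before the task runs;
        // the HDFS path below is a placeholder.
        DistributedCache.addCacheArchive(new URI("/myapp/geo-data.tar.gz"), job);
        // Optionally have the cached file(s) symlinked into the task's
        // current working directory.
        DistributedCache.createSymlink(job);
      }
    }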

File diff suppressed because it is too large
+ 1 - 1
docs/mapred_tutorial.pdf


+ 2 - 1
src/docs/src/documentation/content/xdocs/mapred_tutorial.xml

@@ -1416,7 +1416,8 @@
 
           <p><code>DistributedCache</code> can be used to distribute simple, 
           read-only data/text files and more complex types such as archives and
-          jars. Archives (zip files) are <em>un-archived</em> at the slave nodes.
+          jars. Archives (zip, tar, tgz and tar.gz files) are 
+          <em>un-archived</em> at the slave nodes.
           Optionally users can also direct the <code>DistributedCache</code> to 
           <em>symlink</em> the cached file(s) into the <code>current working 
           directory</code> of the task via the 

+ 33 - 17
src/java/org/apache/hadoop/filecache/DistributedCache.java

@@ -50,11 +50,11 @@ import java.net.URI;
  *
  * <p><code>DistributedCache</code> can be used to distribute simple, read-only
  * data/text files and/or more complex types such as archives, jars etc. 
- * Archives (zip files) are un-archived at the slave nodes. Jars maybe be 
- * optionally added to the classpath of the tasks, a rudimentary software
- * distribution mechanism.  Files have execution permissions. Optionally users 
- * can also direct it to symlink the distributed cache file(s) into 
- * the working directory of the task.</p>
+ * Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes. 
+ * Jars may be optionally added to the classpath of the tasks, a rudimentary 
+ * software distribution mechanism.  Files have execution permissions.
+ * Optionally users can also direct it to symlink the distributed cache file(s)
+ * into the working directory of the task.</p>
  * 
  * <p><code>DistributedCache</code> tracks modification timestamps of the cache 
  * files. Clearly the cache files should not be modified by the application 
@@ -70,6 +70,9 @@ import java.net.URI;
  *     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
  *     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
  *     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ *     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ *     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ *     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
  *     
  *     2. Setup the application's <code>JobConf</code>:
  *     
@@ -78,7 +81,10 @@ import java.net.URI;
  *                                   job);
 *     DistributedCache.addCacheArchive(new URI("/myapp/map.zip"), job);
  *     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
- *
+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar"), job);
+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz"), job);
+ *     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz"), job);
+ *     
  *     3. Use the cached files in the {@link Mapper} or {@link Reducer}:
  *     
  *     public static class MapClass extends MapReduceBase  
@@ -129,9 +135,11 @@ public class DistributedCache {
   * @param conf The Configuration file which contains the filesystem
   * @param baseDir The base cache Dir where you want to localize the files/archives
    * @param fileStatus The file status on the dfs.
-   * @param isArchive if the cache is an archive or a file. In case it is an archive
-   *  with a .zip or .jar extension it will be unzipped/unjarred automatically 
-   *  and the directory where the archive is unjarred is returned as the Path.
+   * @param isArchive if the cache is an archive or a file. In case it is an
+   *  archive with a .zip or .jar or .tar or .tgz or .tar.gz extension it will
+   *  be unzipped/unjarred/untarred automatically 
+   *  and the directory where the archive is unzipped/unjarred/untarred is
+   *  returned as the Path.
    *  In case of a file, the path to the file is returned
    * @param confFileStamp this is the hdfs file modification timestamp to verify that the 
    * file to be cached hasn't changed since the job started
@@ -185,9 +193,11 @@ public class DistributedCache {
    * being used in the Configuration
   * @param conf The Configuration file which contains the filesystem
   * @param baseDir The base cache Dir where you want to localize the files/archives
-   * @param isArchive if the cache is an archive or a file. In case it is an archive
-   *  with a .zip or .jar extension it will be unzipped/unjarred automatically 
-   *  and the directory where the archive is unjarred is returned as the Path.
+   * @param isArchive if the cache is an archive or a file. In case it is an 
+   *  archive with a .zip or .jar or .tar or .tgz or .tar.gz extension it will 
+   *  be unzipped/unjarred/untarred automatically 
+   *  and the directory where the archive is unzipped/unjarred/untarred 
+   *  is returned as the Path.
    *  In case of a file, the path to the file is returned
    * @param confFileStamp this is the hdfs file modification timestamp to verify that the 
    * file to be cached hasn't changed since the job started
@@ -331,13 +341,14 @@ public class DistributedCache {
       fs.copyToLocalFile(new Path(cacheId), parchive);
       if (isArchive) {
         String tmpArchive = parchive.toString().toLowerCase();
+        File srcFile = new File(parchive.toString());
+        File destDir = new File(parchive.getParent().toString());
         if (tmpArchive.endsWith(".jar")) {
-          RunJar.unJar(new File(parchive.toString()), new File(parchive
-                                                               .getParent().toString()));
+          RunJar.unJar(srcFile, destDir);
         } else if (tmpArchive.endsWith(".zip")) {
-          FileUtil.unZip(new File(parchive.toString()), new File(parchive
-                                                                 .getParent().toString()));
-
+          FileUtil.unZip(srcFile, destDir);
+        } else if (isTarFile(tmpArchive)) {
+          FileUtil.unTar(srcFile, destDir);
         }
        // else will not do anything
         // and copy the file into the dir as it is
@@ -373,6 +384,11 @@ public class DistributedCache {
     }
   }
 
+  private static boolean isTarFile(String filename) {
+    return (filename.endsWith(".tgz") || filename.endsWith(".tar.gz") ||
+           filename.endsWith(".tar"));
+  }
+  
   // Checks if the cache has already been localized and is fresh
   private static boolean ifExistsAndFresh(Configuration conf, FileSystem fs, 
                                           URI cache, long confFileStamp, 
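
As a companion to step 3 of the javadoc above, a minimal sketch of a task reading back the localized archives (the field name and error handling are illustrative assumptions); each returned Path is the directory where one zip/jar/tar/tgz/tar.gz archive was un-archived:

    import java.io.IOException;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.filecache.DistributedCache;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.MapReduceBase;

    public class MapClass extends MapReduceBase {
      private Path[] localArchives;

      public void configure(JobConf job) {
        try {
          // Directories on the task tracker's local disk holding the
          // un-archived contents of each cached archive.
          localArchives = DistributedCache.getLocalCacheArchives(job);
        } catch (IOException ioe) {
          throw new RuntimeException("Cannot read local cache archives", ioe);
        }
      }
    }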

+ 46 - 1
src/java/org/apache/hadoop/fs/FileUtil.java

@@ -26,6 +26,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.util.Shell.ShellCommandExecutor;
 
 /**
  * A collection of file-processing util methods
@@ -453,7 +454,51 @@ public class FileUtil {
       zipFile.close();
     }
   }
-  
+
+  /**
+   * Given a tar file as input, it will untar the file into the untar
+   * directory passed as the second parameter.
+   * 
+   * This utility will untar ".tar" files and ".tar.gz"/".tgz" files.
+   *  
+   * @param inFile The tar file as input. 
+   * @param untarDir The untar directory where to untar the tar file.
+   * @throws IOException
+   */
+  public static void unTar(File inFile, File untarDir) throws IOException {
+    if (!untarDir.mkdirs()) {           
+      if (!untarDir.isDirectory()) {
+        throw new IOException("Mkdirs failed to create " + untarDir);
+      }
+    }
+
+    StringBuffer untarCommand = new StringBuffer();
+    boolean gzipped = inFile.toString().endsWith("gz");
+    if (gzipped) {
+      untarCommand.append(" gzip -dc '");
+      untarCommand.append(FileUtil.makeShellPath(inFile));
+      untarCommand.append("' | (");
+    } 
+    untarCommand.append("cd '");
+    untarCommand.append(FileUtil.makeShellPath(untarDir)); 
+    untarCommand.append("' ; ");
+    untarCommand.append("tar -xf ");
+    
+    if (gzipped) {
+      untarCommand.append(" -)");
+    } else {
+      untarCommand.append(FileUtil.makeShellPath(inFile));
+    }
+    String[] shellCmd = { "bash", "-c", untarCommand.toString() };
+    ShellCommandExecutor shexec = new ShellCommandExecutor(shellCmd);
+    shexec.execute();
+    int exitcode = shexec.getExitCode();
+    if (exitcode != 0) {
+      throw new IOException("Error untarring file " + inFile + 
+                  ". Tar process exited with exit code " + exitcode);
+    }
+  }
+
   /**
    * Class for creating hardlinks.
    * Supports Unix, Cygwin, WindXP.
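
A minimal usage sketch for the new FileUtil.unTar method (the file paths and driver class are illustrative assumptions); note that the implementation shells out to gzip/tar via bash, so a Unix-like environment is assumed:

    import java.io.File;
    import java.io.IOException;
    import org.apache.hadoop.fs.FileUtil;

    public class UnTarExample {  // hypothetical driver class
      public static void main(String[] args) throws IOException {
        File tarball = new File(args[0]);   // e.g. /tmp/data/test.tar.gz
        File untarDir = new File(args[1]);  // created if it does not exist
        // A non-zero exit code from the gzip/tar pipeline surfaces as an
        // IOException.
        FileUtil.unTar(tarball, untarDir);
      }
    }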

+ 1 - 7
src/test/org/apache/hadoop/dfs/TestDFSUpgradeFromImage.java

@@ -27,7 +27,6 @@ import java.util.TreeMap;
 import java.util.zip.CRC32;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.fs.FSInputStream;
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.dfs.FSConstants.StartupOption;
@@ -72,12 +71,7 @@ public class TestDFSUpgradeFromImage extends TestCase {
     if ( dfsDir.exists() && !FileUtil.fullyDelete(dfsDir) ) {
       throw new IOException("Could not delete dfs directory '" + dfsDir + "'");
     }
-    String cmd = "gzip -dc '" + FileUtil.makeShellPath(tarFile) + "' | (cd '" +
-                 FileUtil.makeShellPath(dataDir) + "' ; tar -xf -)";
-    LOG.info("Unpacking the tar file. Cmd : " + cmd);
-    String[] shellCmd = { "bash", "-c", cmd };
-    Shell.execCommand(shellCmd);
-    
+    FileUtil.unTar(new File(tarFile), new File(dataDir));
     //Now read the reference info
     
     BufferedReader reader = new BufferedReader( 

+ 14 - 1
src/test/org/apache/hadoop/mapred/MRCaching.java

@@ -175,6 +175,9 @@ public class MRCaching {
     Path txtPath = new Path(localPath, new Path("test.txt"));
     Path jarPath = new Path(localPath, new Path("test.jar"));
     Path zipPath = new Path(localPath, new Path("test.zip"));
+    Path tarPath = new Path(localPath, new Path("test.tgz"));
+    Path tarPath1 = new Path(localPath, new Path("test.tar.gz"));
+    Path tarPath2 = new Path(localPath, new Path("test.tar"));
     Path cachePath = new Path(cacheDir);
     fs.delete(cachePath, true);
     if (!fs.mkdirs(cachePath)) {
@@ -183,13 +186,23 @@ public class MRCaching {
     fs.copyFromLocalFile(txtPath, cachePath);
     fs.copyFromLocalFile(jarPath, cachePath);
     fs.copyFromLocalFile(zipPath, cachePath);
+    fs.copyFromLocalFile(tarPath, cachePath);
+    fs.copyFromLocalFile(tarPath1, cachePath);
+    fs.copyFromLocalFile(tarPath2, cachePath);
     // setting the cached archives to zip, jar and simple text files
     URI uri1 = fs.getUri().resolve(cachePath + "/test.jar");
     URI uri2 = fs.getUri().resolve(cachePath + "/test.zip");
     URI uri3 = fs.getUri().resolve(cachePath + "/test.txt");
+    URI uri4 = fs.getUri().resolve(cachePath + "/test.tgz");
+    URI uri5 = fs.getUri().resolve(cachePath + "/test.tar.gz");
+    URI uri6 = fs.getUri().resolve(cachePath + "/test.tar");
+
     DistributedCache.addCacheArchive(uri1, conf);
     DistributedCache.addCacheArchive(uri2, conf);
     DistributedCache.addCacheFile(uri3, conf);
+    DistributedCache.addCacheArchive(uri4, conf);
+    DistributedCache.addCacheArchive(uri5, conf);
+    DistributedCache.addCacheArchive(uri6, conf);
     RunningJob job = JobClient.runJob(conf);
     int count = 0;
     // after the job ran check to see if the the input from the localized cache
@@ -208,7 +221,7 @@ public class MRCaching {
       }
       file.close();
     }
-    if (count != 3)
+    if (count != 6)
       return new TestResult(job, false);
 
     return new TestResult(job, true);

BIN
src/test/org/apache/hadoop/mapred/test.tar


BIN
src/test/org/apache/hadoop/mapred/test.tar.gz


BIN
src/test/org/apache/hadoop/mapred/test.tgz


Some files were not shown because too many files changed in this diff