@@ -18,14 +18,18 @@
 package org.apache.hadoop.tools;
 
+import java.io.DataInput;
+import java.io.DataOutput;
 import java.io.FileNotFoundException;
 import java.io.IOException;
-
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
 import java.util.Set;
 import java.util.TreeMap;
@@ -38,10 +42,12 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.HarFileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.FileSplit;
@@ -53,9 +59,11 @@ import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.RecordReader;
 import org.apache.hadoop.mapred.Reducer;
-import org.apache.hadoop.mapred.SequenceFileRecordReader;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileRecordReader;
 import org.apache.hadoop.mapred.lib.NullOutputFormat;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobSubmissionFiles;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
@@ -67,6 +75,7 @@ import org.apache.hadoop.util.ToolRunner;
  * Hadoop archives look at {@link HarFileSystem}.
  */
 public class HadoopArchives implements Tool {
+  public static final int VERSION = 3;
   private static final Log LOG = LogFactory.getLog(HadoopArchives.class);
 
   private static final String NAME = "har";
@@ -77,12 +86,19 @@ public class HadoopArchives implements Tool {
   static final String SRC_COUNT_LABEL = NAME + ".src.count";
   static final String TOTAL_SIZE_LABEL = NAME + ".total.size";
   static final String DST_HAR_LABEL = NAME + ".archive.name";
-  // size of each part file
-  // its fixed for now.
-  static final long partSize = 2 * 1024 * 1024 * 1024l;
+  static final String SRC_PARENT_LABEL = NAME + ".parent.path";
+  /** the size of the blocks that will be created when archiving **/
+  static final String HAR_BLOCKSIZE_LABEL = NAME + ".block.size";
+  /** the size of the part files that will be created when archiving **/
+  static final String HAR_PARTSIZE_LABEL = NAME + ".partfile.size";
+
+  /** size of each part file **/
+  long partSize = 2 * 1024 * 1024 * 1024L;
+  /** size of blocks in hadoop archives **/
+  long blockSize = 512 * 1024 * 1024L;
 
   private static final String usage = "archive"
-  + " -archiveName NAME <src>* <dest>" +
+  + " -archiveName NAME -p <parent path> <src>* <dest>" +
   "\n";
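
The part and block sizes are now per-job knobs rather than a compile-time constant. A minimal sketch of overriding them before running the tool; the sizes are hypothetical, the Configuration-taking constructor is assumed (it is not shown in this patch), and the labels resolve to "har.block.size" and "har.partfile.size" as defined above:

    Configuration conf = new Configuration();
    conf.setLong("har.block.size", 256 * 1024 * 1024L);     // HAR_BLOCKSIZE_LABEL
    conf.setLong("har.partfile.size", 1024 * 1024 * 1024L); // HAR_PARTSIZE_LABEL
    int ret = ToolRunner.run(conf, new HadoopArchives(conf), args); // assumed ctor
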
@@ -118,22 +134,68 @@ public class HadoopArchives implements Tool {
   /**
    * this assumes that there are two types of files file/dir
    * @param fs the input filesystem
-   * @param p the top level path
+   * @param fdir the filestatusdir of the path
    * @param out the list of paths output of recursive ls
    * @throws IOException
    */
-  private void recursivels(FileSystem fs, Path p, List<FileStatus> out)
+  private void recursivels(FileSystem fs, FileStatusDir fdir, List<FileStatusDir> out)
   throws IOException {
-    FileStatus fstatus = fs.getFileStatus(p);
-    if (!fstatus.isDir()) {
-      out.add(fstatus);
+    if (!fdir.getFileStatus().isDir()) {
+      out.add(fdir);
       return;
     }
     else {
-      out.add(fstatus);
-      FileStatus[] listStatus = fs.listStatus(p);
+      out.add(fdir);
+      FileStatus[] listStatus = fs.listStatus(fdir.getFileStatus().getPath());
+      fdir.setChildren(listStatus);
       for (FileStatus stat: listStatus) {
-        recursivels(fs, stat.getPath(), out);
+        FileStatusDir fstatDir = new FileStatusDir(stat, null);
+        recursivels(fs, fstatDir, out);
+      }
+    }
+  }
+
+  /** HarEntry is used in the {@link HArchivesMapper} as the input value. */
+  private static class HarEntry implements Writable {
+    String path;
+    String[] children;
+
+    HarEntry() {}
+
+    HarEntry(String path, String[] children) {
+      this.path = path;
+      this.children = children;
+    }
+
+    boolean isDir() {
+      return children != null;
+    }
+
+    @Override
+    public void readFields(DataInput in) throws IOException {
+      path = Text.readString(in);
+
+      if (in.readBoolean()) {
+        children = new String[in.readInt()];
+        for (int i = 0; i < children.length; i++) {
+          children[i] = Text.readString(in);
+        }
+      } else {
+        children = null;
+      }
+    }
+
+    @Override
+    public void write(DataOutput out) throws IOException {
+      Text.writeString(out, path);
+
+      final boolean dir = isDir();
+      out.writeBoolean(dir);
+      if (dir) {
+        out.writeInt(children.length);
+        for (String c : children) {
+          Text.writeString(out, c);
+        }
       }
     }
   }
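
Because HarEntry now rides through a SequenceFile, it must honor the Writable round-trip contract: readFields() applied to write()'s output must reproduce the entry, including the children == null encoding of files. A quick sketch of that invariant using the existing org.apache.hadoop.io buffer helpers (values are illustrative):

    HarEntry entry = new HarEntry("/user", new String[] {"harry", "hadoop"});
    DataOutputBuffer dob = new DataOutputBuffer();
    entry.write(dob);                           // path, isDir flag, children
    DataInputBuffer dib = new DataInputBuffer();
    dib.reset(dob.getData(), dob.getLength());
    HarEntry copy = new HarEntry();
    copy.readFields(dib);                       // copy.isDir() == true, same children
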
@@ -142,8 +204,7 @@ public class HadoopArchives implements Tool {
    * Input format of a hadoop archive job responsible for
    * generating splits of the file list
    */
-
-  static class HArchiveInputFormat implements InputFormat<LongWritable, Text> {
+  static class HArchiveInputFormat implements InputFormat<LongWritable, HarEntry> {
 
     //generate input splits from the src file lists
     public InputSplit[] getSplits(JobConf jconf, int numSplits)
@@ -163,7 +224,7 @@ public class HadoopArchives implements Tool {
       FileStatus fstatus = fs.getFileStatus(src);
       ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
       LongWritable key = new LongWritable();
-      Text value = new Text();
+      final HarEntry value = new HarEntry();
       SequenceFile.Reader reader = null;
       // the remaining bytes in the file split
       long remaining = fstatus.getLen();
@@ -200,9 +261,10 @@ public class HadoopArchives implements Tool {
       return splits.toArray(new FileSplit[splits.size()]);
     }
 
-    public RecordReader<LongWritable, Text> getRecordReader(InputSplit split,
+    @Override
+    public RecordReader<LongWritable, HarEntry> getRecordReader(InputSplit split,
         JobConf job, Reporter reporter) throws IOException {
-      return new SequenceFileRecordReader<LongWritable, Text>(job,
+      return new SequenceFileRecordReader<LongWritable, HarEntry>(job,
           (FileSplit)split);
     }
   }
@@ -228,24 +290,53 @@ public class HadoopArchives implements Tool {
     return deepest;
   }
 
-  // this method is tricky. This method writes
-  // the top level directories in such a way so that
-  // the output only contains valid directoreis in archives.
-  // so for an input path specified by the user
-  // as /user/hadoop
-  // we need to index
-  // / as the root
-  // /user as a directory
-  // /user/hadoop as a directory
-  // so for multiple input paths it makes sure that it
-  // does the right thing.
-  // so if the user specifies the input directories as
-  // /user/harry and /user/hadoop
-  // we need to write / and user as its child
-  // and /user and harry and hadoop as its children
+  /**
+   * truncate the prefix root from the full path
+   * @param fullPath the full path
+   * @param root the prefix root to be truncated
+   * @return the relative path
+   */
+  private Path relPathToRoot(Path fullPath, Path root) {
+    // use the Path api rather than string manipulation,
+    // so that this does not break when paths change later
+    final Path justRoot = new Path(Path.SEPARATOR);
+    if (fullPath.depth() == root.depth()) {
+      return justRoot;
+    }
+    else if (fullPath.depth() > root.depth()) {
+      Path retPath = new Path(fullPath.getName());
+      Path parent = fullPath.getParent();
+      for (int i = 0; i < (fullPath.depth() - root.depth() - 1); i++) {
+        retPath = new Path(parent.getName(), retPath);
+        parent = parent.getParent();
+      }
+      return new Path(justRoot, retPath);
+    }
+    return null;
+  }
+
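
A worked example of relPathToRoot with hypothetical paths, tracing the branch above:

    // fullPath /home/user/dir1 (depth 3), root /home (depth 1):
    Path rel = relPathToRoot(new Path("/home/user/dir1"), new Path("/home"));
    // the loop runs once, prepending "user" to "dir1" -> returns /user/dir1;
    // equal depths return just "/", and a fullPath outside root returns null
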
+  /**
+   * this method writes all the valid top level directories
+   * into the srcWriter for indexing. This method is a little
+   * tricky. example:
+   * for an input with parent path /home/user/ and sources
+   * as /home/user/source/dir1, /home/user/source/dir2 - this
+   * will output <source, dir, dir1, dir2> (dir means that source is a dir
+   * with dir1 and dir2 as children) and <source/dir1, file, null>
+   * and <source/dir2, file, null>
+   * @param srcWriter the sequence file writer to write the
+   * directories to
+   * @param paths the source paths provided by the user. They
+   * are glob free and have full path (not relative paths)
+   * @param parentPath the parent path that you want the archives
+   * to be relative to. example - /home/user/dir1 can be archived with
+   * parent as /home or /home/user.
+   * @throws IOException
+   */
   private void writeTopLevelDirs(SequenceFile.Writer srcWriter,
-    List<Path> paths) throws IOException {
-    //these are qualified paths
+      List<Path> paths, Path parentPath) throws IOException {
+    //add all the directories
     List<Path> justDirs = new ArrayList<Path>();
     for (Path p: paths) {
       if (!p.getFileSystem(getConf()).isFile(p)) {
@@ -255,17 +346,23 @@ public class HadoopArchives implements Tool {
         justDirs.add(new Path(p.getParent().toUri().getPath()));
       }
     }
-
-    //get the largest depth path
-    // this is tricky
-    TreeMap<String, HashSet<String>> allpaths = new TreeMap<String, HashSet<String>>();
+    /* find all the common parents of paths that are valid archive
+     * paths. The below is done so that we do not add a common path
+     * twice, and also so that we only add the valid children of a
+     * path that are specified by the user.
+     */
+    TreeMap<String, HashSet<String>> allpaths = new TreeMap<String,
+                                                    HashSet<String>>();
+    /* the largest depth of paths. the max number of times
+     * we need to iterate
+     */
     Path deepest = largestDepth(paths);
     Path root = new Path(Path.SEPARATOR);
-    for (int i = 0; i < deepest.depth(); i++) {
+    for (int i = parentPath.depth(); i < deepest.depth(); i++) {
       List<Path> parents = new ArrayList<Path>();
       for (Path p: justDirs) {
         if (p.compareTo(root) == 0){
-          //don nothing
+          //do nothing
         }
         else {
           Path parent = p.getParent();
@@ -287,53 +384,118 @@ public class HadoopArchives implements Tool {
       }
     }
     Set<Map.Entry<String, HashSet<String>>> keyVals = allpaths.entrySet();
     for (Map.Entry<String, HashSet<String>> entry : keyVals) {
-      HashSet<String> children = entry.getValue();
-      String toWrite = entry.getKey() + " dir ";
-      StringBuffer sbuff = new StringBuffer();
-      sbuff.append(toWrite);
-      for (String child: children) {
-        sbuff.append(child + " ");
+      final Path relPath = relPathToRoot(new Path(entry.getKey()), parentPath);
+      if (relPath != null) {
+        final String[] children = new String[entry.getValue().size()];
+        int i = 0;
+        for (String child: entry.getValue()) {
+          children[i++] = child;
+        }
+        append(srcWriter, 0L, relPath.toString(), children);
       }
-      toWrite = sbuff.toString();
-      srcWriter.append(new LongWritable(0L), new Text(toWrite));
+    }
+  }
+
+  private void append(SequenceFile.Writer srcWriter, long len,
+      String path, String[] children) throws IOException {
+    srcWriter.append(new LongWritable(len), new HarEntry(path, children));
+  }
+
+  /**
+   * A static class that keeps
+   * track of the status of a path
+   * and its children if the path is a dir
+   */
+  static class FileStatusDir {
+    private FileStatus fstatus;
+    private FileStatus[] children = null;
+
+    /**
+     * constructor for filestatusdir
+     * @param fstatus the filestatus object that maps to filestatusdir
+     * @param children the children list if fs is a directory
+     */
+    FileStatusDir(FileStatus fstatus, FileStatus[] children) {
+      this.fstatus = fstatus;
+      this.children = children;
+    }
+
+    /**
+     * set children of this object
+     * @param listStatus the list of children
+     */
+    public void setChildren(FileStatus[] listStatus) {
+      this.children = listStatus;
+    }
+
+    /**
+     * the filestatus of this object
+     * @return the filestatus of this object
+     */
+    FileStatus getFileStatus() {
+      return this.fstatus;
+    }
+
+    /**
+     * the children list of this object, null if not a directory
+     * @return the children list
+     */
+    FileStatus[] getChildren() {
+      return this.children;
     }
   }
 
   /**archive the given source paths into
    * the dest
+   * @param parentPath the parent path of all the source paths
    * @param srcPaths the src paths to be archived
    * @param dest the dest dir that will contain the archive
    */
-  public void archive(List<Path> srcPaths, String archiveName, Path dest)
-  throws IOException {
+  void archive(Path parentPath, List<Path> srcPaths,
+      String archiveName, Path dest) throws IOException {
     checkPaths(conf, srcPaths);
     int numFiles = 0;
     long totalSize = 0;
+    FileSystem fs = parentPath.getFileSystem(conf);
+    this.blockSize = conf.getLong(HAR_BLOCKSIZE_LABEL, blockSize);
+    this.partSize = conf.getLong(HAR_PARTSIZE_LABEL, partSize);
+    conf.setLong(HAR_BLOCKSIZE_LABEL, blockSize);
+    conf.setLong(HAR_PARTSIZE_LABEL, partSize);
     conf.set(DST_HAR_LABEL, archiveName);
+    conf.set(SRC_PARENT_LABEL, parentPath.makeQualified(fs).toString());
     Path outputPath = new Path(dest, archiveName);
     FileOutputFormat.setOutputPath(conf, outputPath);
     FileSystem outFs = outputPath.getFileSystem(conf);
     if (outFs.exists(outputPath) || outFs.isFile(dest)) {
-      throw new IOException("Invalid Output.");
+      throw new IOException("Invalid Output: " + outputPath);
     }
     conf.set(DST_DIR_LABEL, outputPath.toString());
-    final String randomId = DistCp.getRandomId();
-    Path jobDirectory = new Path(new JobClient(conf).getSystemDir(),
-        NAME + "_" + randomId);
+    JobClient jClient = new JobClient(conf);
+    Path stagingArea;
+    try {
+      stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
+    } catch (InterruptedException ie) {
+      throw new IOException(ie);
+    }
+    Path jobDirectory = new Path(stagingArea,
+        NAME + "_" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE), 36));
+    FsPermission mapredSysPerms =
+      new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
+    FileSystem.mkdirs(jobDirectory.getFileSystem(conf), jobDirectory,
+        mapredSysPerms);
     conf.set(JOB_DIR_LABEL, jobDirectory.toString());
     //get a tmp directory for input splits
     FileSystem jobfs = jobDirectory.getFileSystem(conf);
-    jobfs.mkdirs(jobDirectory);
     Path srcFiles = new Path(jobDirectory, "_har_src_files");
     conf.set(SRC_LIST_LABEL, srcFiles.toString());
     SequenceFile.Writer srcWriter = SequenceFile.createWriter(jobfs, conf,
-        srcFiles, LongWritable.class, Text.class,
+        srcFiles, LongWritable.class, HarEntry.class,
         SequenceFile.CompressionType.NONE);
     // get the list of files
     // create single list of files and dirs
     try {
       // write the top level dirs in first
-      writeTopLevelDirs(srcWriter, srcPaths);
+      writeTopLevelDirs(srcWriter, srcPaths, parentPath);
       srcWriter.sync();
       // these are the input paths passed
       // from the command line
@@ -341,28 +503,27 @@ public class HadoopArchives implements Tool {
       // and then write them to the input file
       // one at a time
       for (Path src: srcPaths) {
-        FileSystem fs = src.getFileSystem(conf);
-        ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>();
-        recursivels(fs, src, allFiles);
-        for (FileStatus stat: allFiles) {
-          String toWrite = "";
+        ArrayList<FileStatusDir> allFiles = new ArrayList<FileStatusDir>();
+        FileStatus fstatus = fs.getFileStatus(src);
+        FileStatusDir fdir = new FileStatusDir(fstatus, null);
+        recursivels(fs, fdir, allFiles);
+        for (FileStatusDir statDir: allFiles) {
+          FileStatus stat = statDir.getFileStatus();
           long len = stat.isDir()? 0:stat.getLen();
+          final Path path = relPathToRoot(stat.getPath(), parentPath);
+          final String[] children;
           if (stat.isDir()) {
-            toWrite = "" + fs.makeQualified(stat.getPath()) + " dir ";
             //get the children
-            FileStatus[] list = fs.listStatus(stat.getPath());
-            StringBuffer sbuff = new StringBuffer();
-            sbuff.append(toWrite);
-            for (FileStatus stats: list) {
-              sbuff.append(stats.getPath().getName() + " ");
+            FileStatus[] list = statDir.getChildren();
+            children = new String[list.length];
+            for (int i = 0; i < list.length; i++) {
+              children[i] = list[i].getPath().getName();
             }
-            toWrite = sbuff.toString();
           }
           else {
-            toWrite += fs.makeQualified(stat.getPath()) + " file ";
+            children = null;
           }
-          srcWriter.append(new LongWritable(len), new
-              Text(toWrite));
+          append(srcWriter, len, path.toString(), children);
           srcWriter.sync();
           numFiles++;
           totalSize += len;
@@ -399,23 +560,26 @@ public class HadoopArchives implements Tool {
   }
 
   static class HArchivesMapper
-      implements Mapper<LongWritable, Text, IntWritable, Text> {
+      implements Mapper<LongWritable, HarEntry, IntWritable, Text> {
     private JobConf conf = null;
     int partId = -1 ;
     Path tmpOutputDir = null;
     Path tmpOutput = null;
     String partname = null;
+    Path rootPath = null;
     FSDataOutputStream partStream = null;
     FileSystem destFs = null;
     byte[] buffer;
     int buf_size = 128 * 1024;
-
+    long blockSize = 512 * 1024 * 1024L;
+
     // configure the mapper and create
     // the part file.
     // use map reduce framework to write into
     // tmp files.
     public void configure(JobConf conf) {
       this.conf = conf;
+
       // this is tightly tied to map reduce
       // since it does not expose an api
       // to get the partition
@@ -423,19 +587,27 @@ public class HadoopArchives implements Tool {
       // create a file name using the partition
       // we need to write to this directory
       tmpOutputDir = FileOutputFormat.getWorkOutputPath(conf);
+      blockSize = conf.getLong(HAR_BLOCKSIZE_LABEL, blockSize);
       // get the output path and write to the tmp
       // directory
       partname = "part-" + partId;
       tmpOutput = new Path(tmpOutputDir, partname);
+      rootPath = (conf.get(SRC_PARENT_LABEL, null) == null) ? null :
+                  new Path(conf.get(SRC_PARENT_LABEL));
+      if (rootPath == null) {
+        throw new RuntimeException("Unable to read parent " +
+            "path for har from config");
+      }
       try {
         destFs = tmpOutput.getFileSystem(conf);
         //this was a stale copy
         if (destFs.exists(tmpOutput)) {
           destFs.delete(tmpOutput, false);
-        }
-        partStream = destFs.create(tmpOutput);
+        }
+        partStream = destFs.create(tmpOutput, false, conf.getInt("io.file.buffer.size", 4096),
+            destFs.getDefaultReplication(), blockSize);
       } catch(IOException ie) {
-        throw new RuntimeException("Unable to open output file " + tmpOutput);
+        throw new RuntimeException("Unable to open output file " + tmpOutput, ie);
       }
       buffer = new byte[buf_size];
     }
@@ -453,71 +625,70 @@ public class HadoopArchives implements Tool {
       }
     }
 
-    // the relative path of p. basically
-    // getting rid of schema. Parsing and doing
-    // string manipulation is not good - so
-    // just use the path api to do it.
-    private Path makeRelative(Path p) {
-      Path retPath = new Path(p.toUri().getPath());
-      return retPath;
-    }
-
-    static class MapStat {
-      private String pathname;
-      private boolean isDir;
-      private List<String> children;
-      public MapStat(String line) {
-        String[] splits = line.split(" ");
-        pathname = splits[0];
-        if ("dir".equals(splits[1])) {
-          isDir = true;
-        }
-        else {
-          isDir = false;
-        }
-        if (isDir) {
-          children = new ArrayList<String>();
-          for (int i = 2; i < splits.length; i++) {
-            children.add(splits[i]);
-          }
-        }
+    /**
+     * get rid of the / in the beginning of the path
+     * and resolve it against the given parent
+     * @param p the path, relative to the archive root
+     * @param parent the parent path to resolve against
+     * @return p resolved under parent, without the leading /
+     */
+    private Path realPath(Path p, Path parent) {
+      Path rootPath = new Path(Path.SEPARATOR);
+      if (rootPath.compareTo(p) == 0) {
+        return parent;
       }
+      return new Path(parent, new Path(p.toString().substring(1)));
     }
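
In the mapper, realPath undoes what relPathToRoot did on the driver side: it maps an archive-relative name back to a full path under the configured parent. Two hypothetical cases:

    realPath(new Path("/dir1/file1"), new Path("/home/user"))
    //   -> /home/user/dir1/file1 (leading / stripped, resolved against parent)
    realPath(new Path("/"), new Path("/home/user"))
    //   -> /home/user (the archive root maps back to the parent itself)
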
+
+    private static String encodeName(String s)
+        throws UnsupportedEncodingException {
+      return URLEncoder.encode(s, "UTF-8");
+    }
+
+    private static String encodeProperties(FileStatus fStatus)
+        throws UnsupportedEncodingException {
+      String propStr = encodeName(
+          fStatus.getModificationTime() + " "
+          + fStatus.getPermission().toShort() + " "
+          + encodeName(fStatus.getOwner()) + " "
+          + encodeName(fStatus.getGroup()));
+      return propStr;
+    }
 
    // read files from the split input
    // and write it onto the part files.
    // also output hash(name) and string
    // for reducer to create index
    // and masterindex files.
-    public void map(LongWritable key, Text value,
+    public void map(LongWritable key, HarEntry value,
        OutputCollector<IntWritable, Text> out,
        Reporter reporter) throws IOException {
-      String line = value.toString();
-      MapStat mstat = new MapStat(line);
-      Path srcPath = new Path(mstat.pathname);
-      String towrite = null;
-      Path relPath = makeRelative(srcPath);
+      Path relPath = new Path(value.path);
      int hash = HarFileSystem.getHarHash(relPath);
+      String towrite = null;
+      Path srcPath = realPath(relPath, rootPath);
      long startPos = partStream.getPos();
-      if (mstat.isDir) {
-        towrite = relPath.toString() + " " + "dir none " + 0 + " " + 0 + " ";
+      FileSystem srcFs = srcPath.getFileSystem(conf);
+      FileStatus srcStatus = srcFs.getFileStatus(srcPath);
+      String propStr = encodeProperties(srcStatus);
+      if (value.isDir()) {
+        towrite = encodeName(relPath.toString())
+                  + " dir " + propStr + " 0 0 ";
        StringBuffer sbuff = new StringBuffer();
        sbuff.append(towrite);
-        for (String child: mstat.children) {
-          sbuff.append(child + " ");
+        for (String child: value.children) {
+          sbuff.append(encodeName(child) + " ");
        }
        towrite = sbuff.toString();
        //reading directories is also progress
        reporter.progress();
      }
      else {
-        FileSystem srcFs = srcPath.getFileSystem(conf);
-        FileStatus srcStatus = srcFs.getFileStatus(srcPath);
        FSDataInputStream input = srcFs.open(srcStatus.getPath());
        reporter.setStatus("Copying file " + srcStatus.getPath() +
            " to archive.");
        copyData(srcStatus.getPath(), input, partStream, reporter);
-        towrite = relPath.toString() + " file " + partname + " " + startPos
-            + " " + srcStatus.getLen() + " ";
+        towrite = encodeName(relPath.toString())
+                  + " file " + partname + " " + startPos
+                  + " " + srcStatus.getLen() + " " + propStr + " ";
      }
      out.collect(new IntWritable(hash), new Text(towrite));
    }
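
Each collected value becomes one line of the eventual _index file. With hypothetical values, a file entry now looks like

    %2Fdir1%2Ffile1 file part-0 0 1024 1239287918000+420+hadoop+hadoop

that is: encoded name, type, part file, start offset, length, and the encoded properties. Because the fields are space-delimited, the old unencoded format could not survive names containing spaces; the URL-encoding avoids that.
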
@@ -563,7 +734,7 @@ public class HadoopArchives implements Tool {
      }
      indexStream = fs.create(index);
      outStream = fs.create(masterIndex);
-      String version = HarFileSystem.VERSION + " \n";
+      String version = VERSION + " \n";
      outStream.write(version.getBytes());
 
    } catch(IOException e) {
@@ -611,27 +782,26 @@ public class HadoopArchives implements Tool {
      outStream.close();
      indexStream.close();
      // try increasing the replication
-      fs.setReplication(index, (short) 10);
-      fs.setReplication(masterIndex, (short) 10);
+      fs.setReplication(index, (short) 5);
+      fs.setReplication(masterIndex, (short) 5);
    }
 
  }
 
  /** the main driver for creating the archives
-   * it takes at least two command line parameters. The src and the
-   * dest. It does an lsr on the source paths.
+   * it takes at least three command line parameters. The parent path,
+   * the src and the dest. It does an lsr on the source paths.
   * The mapper created archuves and the reducer creates
   * the archive index.
   */
 
  public int run(String[] args) throws Exception {
    try {
+      Path parentPath = null;
      List<Path> srcPaths = new ArrayList<Path>();
      Path destPath = null;
-      // check we were supposed to archive or
-      // unarchive
      String archiveName = null;
-      if (args.length < 4) {
+      if (args.length < 5) {
        System.out.println(usage);
        throw new IOException("Invalid usage.");
      }
@@ -644,28 +814,52 @@ public class HadoopArchives implements Tool {
        System.out.println(usage);
        throw new IOException("Invalid name for archives. " + archiveName);
      }
-      for (int i = 2; i < args.length; i++) {
+      int i = 2;
+      //check to see if relative parent has been provided or not
+      //this is a required parameter.
+      if (! "-p".equals(args[i])) {
+        System.out.println(usage);
+        throw new IOException("Parent path not specified.");
+      }
+      parentPath = new Path(args[i + 1]);
+      i += 2;
+      //read the rest of the paths
+      for (; i < args.length; i++) {
        if (i == (args.length - 1)) {
          destPath = new Path(args[i]);
        }
        else {
-          srcPaths.add(new Path(args[i]));
+          Path argPath = new Path(args[i]);
+          if (argPath.isAbsolute()) {
+            System.out.println(usage);
+            throw new IOException("source path " + argPath +
+                " is not relative to " + parentPath);
+          }
+          srcPaths.add(new Path(parentPath, argPath));
        }
      }
      if (srcPaths.size() == 0) {
-        System.out.println(usage);
-        throw new IOException("Invalid Usage: No input sources specified.");
+        // assuming if the user does not specify path for sources
+        // the whole parent directory needs to be archived.
+        srcPaths.add(parentPath);
      }
      // do a glob on the srcPaths and then pass it on
      List<Path> globPaths = new ArrayList<Path>();
      for (Path p: srcPaths) {
        FileSystem fs = p.getFileSystem(getConf());
        FileStatus[] statuses = fs.globStatus(p);
-        for (FileStatus status: statuses) {
-          globPaths.add(fs.makeQualified(status.getPath()));
+        if (statuses != null) {
+          for (FileStatus status: statuses) {
+            globPaths.add(fs.makeQualified(status.getPath()));
+          }
        }
      }
-      archive(globPaths, archiveName, destPath);
+      if (globPaths.isEmpty()) {
+        throw new IOException("The resolved paths set is empty."
+            + " Please check whether the srcPaths exist, where srcPaths = "
+            + srcPaths);
+      }
+      archive(parentPath, globPaths, archiveName, destPath);
    } catch(IOException ie) {
      System.err.println(ie.getLocalizedMessage());
      return -1;
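
With the parsing above, -p is mandatory and the sources are given relative to it. A hypothetical invocation:

    hadoop archive -archiveName foo.har -p /user/hadoop dir1 dir2 /user/outputdir

archives /user/hadoop/dir1 and /user/hadoop/dir2 into /user/outputdir/foo.har; leaving the sources out archives all of /user/hadoop, per the srcPaths.size() == 0 branch above.
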
@@ -683,8 +877,13 @@ public class HadoopArchives implements Tool {
      ret = ToolRunner.run(harchives, args);
    } catch(Exception e) {
      LOG.debug("Exception in archives ", e);
-      System.err.println("Exception in archives");
-      System.err.println(e.getLocalizedMessage());
+      System.err.println(e.getClass().getSimpleName() + " in archives");
+      final String s = e.getLocalizedMessage();
+      if (s != null) {
+        System.err.println(s);
+      } else {
+        e.printStackTrace(System.err);
+      }
      System.exit(1);
    }
    System.exit(ret);