فهرست منبع

HADOOP-6467. Improve the performance on HarFileSystem.listStatus(..). Contributed by mahadev

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@915168 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze 15 سال پیش
والد
کامیت
4eedc77275
2 فایلهای تغییر یافته به همراه 68 افزوده شده و 23 حذف شده
  1. 3 0
      CHANGES.txt
  2. 65 23
      src/java/org/apache/hadoop/fs/HarFileSystem.java

+ 3 - 0
CHANGES.txt

@@ -163,6 +163,9 @@ Trunk (unreleased changes)
 
   OPTIMIZATIONS
 
+    HADOOP-6467. Improve the performance on HarFileSystem.listStatus(..).
+    (mahadev via szetszwo)
+
   BUG FIXES
 
     HADOOP-6293. Fix FsShell -text to work on filesystems other than the

+ 65 - 23
src/java/org/apache/hadoop/fs/HarFileSystem.java

@@ -325,25 +325,12 @@ public class HarFileSystem extends FilterFileSystem {
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
      long len) throws IOException {
-    // need to look up the file in the underlying fs
-    // look up the index 
-    
-    // make sure this is a prt of this har filesystem
-    Path p = makeQualified(file.getPath());
-    Path harPath = getPathInHar(p);
-    String line = fileStatusInIndex(harPath);
-    if (line == null)  {
-      throw new FileNotFoundException("File " + file.getPath() + " not found");
-    }
-    HarStatus harStatus = new HarStatus(line);
-    if (harStatus.isDir()) {
-      return new BlockLocation[0];
-    }
-    FileStatus fsFile = fs.getFileStatus(new Path(archivePath,
-        harStatus.getPartName()));
-    BlockLocation[] rawBlocks = fs.getFileBlockLocations(fsFile, 
-        harStatus.getStartIndex() + start, len);
-    return fakeBlockLocations(rawBlocks, harStatus.getStartIndex());
+    // Return a single placeholder BlockLocation rather than real ones.
+    // This is faster and simpler: translating the block locations of
+    // the underlying part files adds a lot of overhead because it
+    // requires looking up the file status in the index files.
+    // NOTE(review): file, start and len are ignored by this implementation.
+    return new BlockLocation[]{ new BlockLocation() };
   }
   
   /**
@@ -387,6 +374,63 @@ public class HarFileSystem extends FilterFileSystem {
     public int endHash;
   }
   
+  /**
+   * Collects the FileStatus of every direct child of a directory by reading
+   * the archive index line by line; a brute-force scan of the whole index.
+   * NOTE(review): matching uses String.startsWith plus a depth check, so an
+   * entry under a sibling whose name begins with the parent's name (e.g.
+   * /ab/c for parent /a) appears to match as well -- confirm.
+   * NOTE(review): an index line without a space makes substring throw.
+   * 
+   * @param parent the directory whose children are collected
+   * @param statuses the list the children's FileStatus objects are added to
+   * @param children the child names of this parent; not read by this method
+   * @param archiveIndexStat the FileStatus of the archive index file
+   * @throws IOException propagated from opening or reading the index
+   */
+  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
+      List<String> children, FileStatus archiveIndexStat) throws IOException {
+    // open the archive index; the stream is closed in the finally block
+    FSDataInputStream aIn = null;
+    try {
+      aIn = fs.open(archiveIndex);
+      LineReader aLin;
+      long read = 0;
+      aLin = new LineReader(aIn, getConf());
+      String parentString = parent.getName();
+      Path harPath = new Path(parentString);
+      int harlen = harPath.depth();
+      Text line = new Text();
+      while (read < archiveIndexStat.getLen()) {
+        int tmp = aLin.readLine(line);
+        read += tmp;
+        String lineFeed = line.toString();
+        String child = lineFeed.substring(0, lineFeed.indexOf(" "));
+        if ((child.startsWith(parentString))) {
+          Path thisPath = new Path(child);
+          if (thisPath.depth() == harlen + 1) {
+            // one level below the parent: treated as a direct child
+            HarStatus hstatus = new HarStatus(lineFeed);
+            FileStatus childStatus = new FileStatus(hstatus.isDir() ? 0
+                : hstatus.getLength(), hstatus.isDir(), (int) archiveIndexStat
+                .getReplication(), archiveIndexStat.getBlockSize(),
+                archiveIndexStat.getModificationTime(), archiveIndexStat
+                    .getAccessTime(), new FsPermission(archiveIndexStat
+                    .getPermission()), archiveIndexStat.getOwner(),
+                archiveIndexStat.getGroup(), makeRelative(this.uri.toString(),
+                    new Path(hstatus.name)));
+            statuses.add(childStatus);
+          }
+          line.clear();
+        }
+      }
+    } finally {
+      if (aIn != null) {
+        aIn.close();
+      }
+    }
+  }
+  
   // make sure that this harPath is relative to the har filesystem
   // this only works for relative paths. This returns the line matching
   // the file in the index. Returns a null if there is not matching 
@@ -650,10 +694,8 @@ public class HarFileSystem extends FilterFileSystem {
             archiveStatus.getOwner(), archiveStatus.getGroup(), 
             makeRelative(this.uri.toString(), new Path(hstatus.name))));
     else 
-      for (String child: hstatus.children) {
-        FileStatus tmp = getFileStatus(new Path(tmpPath, child));
-        statuses.add(tmp);
-      }
+      fileStatusesInIndex(hstatus, statuses, hstatus.children, archiveStatus);
+    
     return statuses.toArray(new FileStatus[statuses.size()]);
   }