
HDFS-461. Tool to analyze file size distribution in HDFS. Contributed by Konstantin Shvachko.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hdfs/trunk@792310 13f79535-47bb-0310-9956-ffa450edef68
Konstantin Shvachko 16 years ago
parent
commit
16aa60e5ff

+ 2 - 0
CHANGES.txt

@@ -11,6 +11,8 @@ Trunk (unreleased changes)
 
     HDFS-459. Introduce Job History Log Analyzer. (shv)
 
+    HDFS-461. Tool to analyze file size distribution in HDFS. (shv)
+
   IMPROVEMENTS
 
     HDFS-381. Remove blocks from DataNode maps when corresponding file

+ 15 - 0
src/docs/src/documentation/content/xdocs/hdfs_imageviewer.xml

@@ -70,6 +70,21 @@
           of this processor is amenable to automated processing and analysis with XML tools.
           Due to the verbosity of the XML syntax, this processor will also generate
           the largest amount of output.</li>
+        <li><strong>FileDistribution</strong> is the tool for analyzing file 
+          sizes in the namespace image. In order to run the tool one should 
+          define a range of integers <code>[0, maxSize]</code> by specifying
+          <code>maxSize</code> and a <code>step</code>.
+          The range of integers is divided into segments of size
+          <code>step</code>:
+          <code>[0, s</code><sub>1</sub><code>, ..., s</code><sub>n-1</sub><code>, maxSize]</code>, 
+          and the processor calculates how many files in the system fall into 
+          each segment <code>[s</code><sub>i-1</sub><code>, s</code><sub>i</sub><code>)</code>.
+          Note that files larger than <code>maxSize</code> always fall into 
+          the very last segment.
+          The output file is formatted as a tab-separated two-column table:
+          Size and NumFiles, where Size represents the start of the segment,
+          and NumFiles is the number of files from the image whose size falls
+          into this segment.</li>
         </ol>
 
     </section> <!-- overview -->
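
As a worked illustration of the segmentation described above (the numbers are hypothetical, not from the patch): with maxSize = 8 and step = 2, the boundaries are [0, 2, 4, 6, 8], so files of sizes 1 and 3 are counted in the segments starting at 0 and 2 respectively, and a file of size 100, being larger than maxSize, is counted in the very last segment.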

+ 182 - 0
src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/FileDistributionVisitor.java

@@ -0,0 +1,182 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.tools.offlineImageViewer;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+/**
+ * File size distribution visitor.
+ * 
+ * <h3>Description.</h3>
+ * This is the tool for analyzing file sizes in the namespace image.
+ * In order to run the tool one should define a range of integers
+ * <tt>[0, maxSize]</tt> by specifying <tt>maxSize</tt> and a <tt>step</tt>.
+ * The range of integers is divided into segments of size <tt>step</tt>: 
+ * <tt>[0, s<sub>1</sub>, ..., s<sub>n-1</sub>, maxSize]</tt>,
+ * and the visitor calculates how many files in the system fall into 
+ * each segment <tt>[s<sub>i-1</sub>, s<sub>i</sub>)</tt>. 
+ * Note that files larger than <tt>maxSize</tt> always fall into 
+ * the very last segment.
+ * 
+ * <h3>Input.</h3>
+ * <ul>
+ * <li><tt>filename</tt> specifies the location of the image file;</li>
+ * <li><tt>maxSize</tt> determines the range <tt>[0, maxSize]</tt> of file
+ * sizes considered by the visitor;</li>
+ * <li><tt>step</tt> the size of the segments into which the range is
+ * divided.</li>
+ * </ul>
+ *
+ * <h3>Output.</h3>
+ * The output file is formatted as a tab-separated two-column table:
+ * Size and NumFiles, where Size represents the start of the segment,
+ * and NumFiles is the number of files from the image whose size falls into
+ * this segment.
+ */
+class FileDistributionVisitor extends TextWriterImageVisitor {
+  final private LinkedList<ImageElement> elemS = new LinkedList<ImageElement>();
+
+  private final static long MAX_SIZE_DEFAULT = 0x2000000000L;   // 1/8 TB = 2^37
+  private final static int INTERVAL_DEFAULT = 0x200000;         // 2 MB = 2^21
+
+  private int[] distribution;
+  private long maxSize;
+  private int step;
+
+  private int totalFiles;
+  private int totalDirectories;
+  private int totalBlocks;
+  private long totalSpace;
+  private long maxFileSize;
+
+  private FileContext current;
+
+  private boolean inInode = false;
+
+  /**
+   * File or directory information.
+   */
+  private static class FileContext {
+    String path;
+    long fileSize;
+    int numBlocks;
+    int replication;
+  }
+
+  public FileDistributionVisitor(String filename,
+                                 long maxSize,
+                                 int step) throws IOException {
+    super(filename, false);
+    this.maxSize = (maxSize == 0 ? MAX_SIZE_DEFAULT : maxSize);
+    this.step = (step == 0 ? INTERVAL_DEFAULT : step);
+    long numIntervals = this.maxSize / this.step;
+    if(numIntervals >= Integer.MAX_VALUE)
+      throw new IOException("Too many distribution intervals " + numIntervals);
+    this.distribution = new int[1 + (int)(numIntervals)];
+    this.totalFiles = 0;
+    this.totalDirectories = 0;
+    this.totalBlocks = 0;
+    this.totalSpace = 0;
+    this.maxFileSize = 0;
+  }
+
+  @Override
+  void start() throws IOException {}
+
+  @Override
+  void finish() throws IOException {
+    // write the distribution into the output file
+    write("Size\tNumFiles\n");
+    for(int i = 0; i < distribution.length; i++)
+      write(((long)i * step) + "\t" + distribution[i] + "\n");
+    System.out.println("totalFiles = " + totalFiles);
+    System.out.println("totalDirectories = " + totalDirectories);
+    System.out.println("totalBlocks = " + totalBlocks);
+    System.out.println("totalSpace = " + totalSpace);
+    System.out.println("maxFileSize = " + maxFileSize);
+    super.finish();
+  }
+
+  @Override
+  void leaveEnclosingElement() throws IOException {
+    ImageElement elem = elemS.pop();
+
+    if(elem != ImageElement.Inode &&
+       elem != ImageElement.INodeUnderConstruction)
+      return;
+    inInode = false;
+    if(current.numBlocks < 0) {
+      totalDirectories ++;
+      return;
+    }
+    totalFiles++;
+    totalBlocks += current.numBlocks;
+    totalSpace += current.fileSize * current.replication;
+    if(maxFileSize < current.fileSize)
+      maxFileSize = current.fileSize;
+    int high;
+    if(current.fileSize > maxSize)
+      high = distribution.length-1;
+    else
+      high = (int)Math.ceil((double)current.fileSize / step);
+    distribution[high]++;
+    if(totalFiles % 1000000 == 1)
+      System.out.println("Files processed: " + totalFiles
+          + "  Current: " + current.path);
+  }
+
+  @Override
+  void visit(ImageElement element, String value) throws IOException {
+    if(inInode) {
+      switch(element) {
+      case INodePath:
+        current.path = (value.equals("") ? "/" : value);
+        break;
+      case Replication:
+        current.replication = Integer.valueOf(value);
+        break;
+      case NumBytes:
+        current.fileSize += Long.valueOf(value);
+        break;
+      default:
+        break;
+      }
+    }
+  }
+
+  @Override
+  void visitEnclosingElement(ImageElement element) throws IOException {
+    elemS.push(element);
+    if(element == ImageElement.Inode ||
+       element == ImageElement.INodeUnderConstruction) {
+      current = new FileContext();
+      inInode = true;
+    }
+  }
+
+  @Override
+  void visitEnclosingElement(ImageElement element,
+      ImageElement key, String value) throws IOException {
+    elemS.push(element);
+    if(element == ImageElement.Inode ||
+       element == ImageElement.INodeUnderConstruction)
+      inInode = true;
+    else if(element == ImageElement.Blocks)
+      current.numBlocks = Integer.parseInt(value);
+  }
+}
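
To make the histogram arithmetic above concrete, here is a minimal self-contained sketch (illustrative only, not part of the patch) that mirrors the bucketing in leaveEnclosingElement() with hypothetical sizes:

    // Illustrative mirror of FileDistributionVisitor's bucket arithmetic.
    // maxSize, step, and the sample sizes are hypothetical values.
    public class FileDistributionSketch {
      public static void main(String[] args) {
        long maxSize = 8;
        int step = 2;
        int[] distribution = new int[1 + (int) (maxSize / step)];
        long[] sampleSizes = {0, 1, 2, 3, 7, 100};  // hypothetical file sizes
        for (long size : sampleSizes) {
          int high = (size > maxSize)
              ? distribution.length - 1                   // overflow bucket
              : (int) Math.ceil((double) size / step);
          distribution[high]++;
        }
        System.out.println("Size\tNumFiles");
        for (int i = 0; i < distribution.length; i++)
          System.out.println(((long) i * step) + "\t" + distribution[i]);
      }
    }

Note that, because of Math.ceil, a size that falls exactly on a boundary (size 2 here) is counted in the row labeled with that boundary, so an interior row effectively covers (Size - step, Size].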

+ 1 - 1
src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageVisitor.java

@@ -93,7 +93,7 @@ abstract class ImageVisitor {
   abstract void finishAbnormally() throws IOException;
 
   /**
-   * Visit element of fsimage with specified value.
+   * Visit a non-enclosing element of fsimage with specified value.
    *
    * @param element FSImage element
    * @param value Element's value

+ 11 - 1
src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/OfflineImageViewer.java

@@ -67,6 +67,11 @@ public class OfflineImageViewer {
     "  * XML: This processor creates an XML document with all elements of\n" +
     "    the fsimage enumerated, suitable for further analysis by XML\n" +
     "    tools.\n" +
+    "  * FileDistribution: This processor analyzes the file size\n" +
+    "    distribution in the image.\n" +
+    "    -maxSize specifies the range [0, maxSize] of file sizes to be\n" +
+    "     analyzed (128GB by default).\n" +
+    "    -step defines the granularity of the distribution. (2MB by default)\n" +
     "\n" + 
     "Required command line arguments:\n" +
     "-i,--inputFile <arg>   FSImage file to process.\n" +
@@ -75,7 +80,8 @@ public class OfflineImageViewer {
     "\n" + 
     "Optional command line arguments:\n" +
     "-p,--processor <arg>   Select which type of processor to apply\n" +
-    "                       against image file. (Ls|XML|Delimited|Indented).\n" +
+    "                       against image file." +
+    " (Ls|XML|Delimited|Indented|FileDistribution).\n" +
     "-h,--help              Display usage information and exit\n" +
     "-printToScreen         For processors that write to a file, also\n" +
     "                       output to screen. On large image files this\n" +
@@ -223,6 +229,10 @@ public class OfflineImageViewer {
                  new DelimitedImageVisitor(outputFile, printToScreen) :
                  new DelimitedImageVisitor(outputFile, printToScreen, delimiter);
       skipBlocks = false;
+    } else if (processor.equals("FileDistribution")) {
+      long maxSize = Long.parseLong(cmd.getOptionValue("maxSize", "0"));
+      int step = Integer.parseInt(cmd.getOptionValue("step", "0"));
+      v = new FileDistributionVisitor(outputFile, maxSize, step);
     } else {
       v = new LsImageVisitor(outputFile, printToScreen);
       skipBlocks = false;
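
Assuming the standard bin/hdfs oiv launcher for the offline image viewer (the launcher itself is not part of this patch), a hypothetical invocation of the new processor, spelling out the documented defaults of maxSize = 2^37 and step = 2MB, might look like:

    bin/hdfs oiv -i fsimage -o fsimage.profile -p FileDistribution -maxSize 137438953472 -step 2097152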

+ 35 - 4
src/test/hdfs/org/apache/hadoop/hdfs/tools/offlineImageViewer/TestOfflineImageViewer.java

@@ -52,6 +52,8 @@ import org.apache.hadoop.hdfs.protocol.FSConstants.SafeModeAction;
  *     file that ends suddenly.
  */
 public class TestOfflineImageViewer extends TestCase {
+  private static final int NUM_DIRS = 3;
+  private static final int FILES_PER_DIR = 4;
 
   // Elements of lines of ls-file output to be compared to FileStatus instance
   private class LsElements {
@@ -80,6 +82,7 @@ public class TestOfflineImageViewer extends TestCase {
     
     // Tests:
     outputOfLSVisitor(originalFsimage);
+    outputOfFileDistributionVisitor(originalFsimage);
     
     unsupportedFSLayoutVersion(originalFsimage);
     
@@ -101,16 +104,14 @@ public class TestOfflineImageViewer extends TestCase {
       cluster = new MiniDFSCluster(conf, 4, true, null);
       FileSystem hdfs = cluster.getFileSystem();
       
-      int numDirs = 3;
-      int numFilesPerDir = 4;
       int filesize = 256;
       
       // Create a reasonable namespace 
-      for(int i = 0; i < numDirs; i++)  {
+      for(int i = 0; i < NUM_DIRS; i++)  {
         Path dir = new Path("/dir" + i);
         hdfs.mkdirs(dir);
         writtenFiles.put(dir.toString(), pathToFileEntry(hdfs, dir.toString()));
-        for(int j = 0; j < numFilesPerDir; j++) {
+        for(int j = 0; j < FILES_PER_DIR; j++) {
           Path file = new Path(dir, "file" + j);
           FSDataOutputStream o = hdfs.create(file);
           o.write(new byte[ filesize++ ]);
@@ -369,4 +370,34 @@ public class TestOfflineImageViewer extends TestCase {
       if(out != null) out.close();
     }
   }
+
+  private void outputOfFileDistributionVisitor(File originalFsimage) {
+    File testFile = new File(ROOT, "/basicCheck");
+    File outputFile = new File(ROOT, "/fileDistributionCheckOutput");
+
+    int totalFiles = 0;
+    try {
+      copyFile(originalFsimage, testFile);
+      ImageVisitor v = new FileDistributionVisitor(outputFile.getPath(), 0, 0);
+      OfflineImageViewer oiv = 
+        new OfflineImageViewer(testFile.getPath(), v, false);
+
+      oiv.go();
+
+      BufferedReader reader = new BufferedReader(new FileReader(outputFile));
+      String line = reader.readLine();
+      assertEquals(line, "Size\tNumFiles");
+      while((line = reader.readLine()) != null) {
+        String[] row = line.split("\t");
+        assertEquals(row.length, 2);
+        totalFiles += Integer.parseInt(row[1]);
+      }
+    } catch (IOException e) {
+      fail("Failed reading valid file: " + e.getMessage());
+    } finally {
+      if(testFile.exists()) testFile.delete();
+      if(outputFile.exists()) outputFile.delete();
+    }
+    assertEquals(totalFiles, NUM_DIRS * FILES_PER_DIR);
+  }
 }
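
For reference, the header check and row parsing in outputOfFileDistributionVisitor() expect tab-separated output shaped like the following illustrative fragment; with the default 2MB step, all NUM_DIRS * FILES_PER_DIR = 12 small test files (a few hundred bytes each) should land in the first non-zero bucket:

    Size    NumFiles
    0       0
    2097152 12
    4194304 0
    ...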