Forráskód Böngészése

HADOOP-2055. Allows users to set PathFilter on the FileInputFormat. Contributed by Alejandro Abdelnur.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/trunk@642211 13f79535-47bb-0310-9956-ffa450edef68
Devaraj Das 17 éve
szülő
commit
2e5acf3bd9
2 módosított fájl, 62 hozzáadás és 2 törlés
  1. 3 0
      CHANGES.txt
  2. 59 2
      src/java/org/apache/hadoop/mapred/FileInputFormat.java

+ 3 - 0
CHANGES.txt

@@ -84,6 +84,9 @@ Trunk (unreleased changes)
     HADOOP-1622.  Allow multiple jar files for map reduce.
     (Mahadev Konar via dhruba)
 
+    HADOOP-2055. Allows users to set PathFilter on the FileInputFormat.
+    (Alejandro Abdelnur via ddas)
+
   IMPROVEMENTS
 
     HADOOP-2655. Copy on write for data and metadata files in the 

+ 59 - 2
src/java/org/apache/hadoop/mapred/FileInputFormat.java

@@ -29,6 +29,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.fs.BlockLocation;
+import org.apache.hadoop.util.ReflectionUtils;
 
 /** 
  * A base class for file-based {@link InputFormat}.
@@ -58,6 +59,28 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
     this.minSplitSize = minSplitSize;
   }
 
+  /**
+   * Proxy PathFilter that accepts a path only if all filters given in the
+   * constructor do. Used by listPaths() to apply the built-in
+   * hiddenFileFilter together with a user-provided one (if any).
+   */
+  private static class MultiPathFilter implements PathFilter {
+    private List<PathFilter> filters;
+
+    public MultiPathFilter(List<PathFilter> filters) {
+      this.filters = filters;
+    }
+
+    public boolean accept(Path path) {
+      for (PathFilter filter : filters) {
+        if (!filter.accept(path)) {
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+
   /**
    * Is the given filename splitable? Usually, true, but if the file is
    * stream compressed, it will not be.
@@ -79,6 +102,28 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
                                                Reporter reporter)
     throws IOException;
 
+  /**
+   * Set a PathFilter to be applied to the input paths for the map-reduce job.
+   * @param conf the job configuration in which the filter class is recorded.
+   * @param filter the PathFilter class to use for filtering the input paths.
+   */
+  public static void setInputPathFilter(JobConf conf,
+                                        Class<? extends PathFilter> filter) {
+    conf.setClass("mapred.input.pathFilter.class", filter, PathFilter.class);
+  }
+
+  /**
+   * Get a PathFilter instance of the filter set for the input paths.
+   * @param conf the job configuration from which the filter class is read.
+   * @return the PathFilter instance set for the job, null if none has been set.
+   */
+  public static PathFilter getInputPathFilter(JobConf conf) {
+    Class filterClass = conf.getClass("mapred.input.pathFilter.class", null,
+        PathFilter.class);
+    return (filterClass != null) ?
+        (PathFilter) ReflectionUtils.newInstance(filterClass, conf) : null;
+  }
+
   /** List input directories.
    * Subclasses may override to, e.g., select only files matching a regular
    * expression. 
@@ -93,11 +138,23 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
     if (dirs.length == 0) {
       throw new IOException("No input paths specified in job");
     }
-    List<Path> result = new ArrayList<Path>(); 
+
+    List<Path> result = new ArrayList<Path>();
+
+    // creates a MultiPathFilter with the hiddenFileFilter and the
+    // user provided one (if any).
+    List<PathFilter> filters = new ArrayList<PathFilter>();
+    filters.add(hiddenFileFilter);
+    PathFilter jobFilter = getInputPathFilter(job);
+    if (jobFilter != null) {
+      filters.add(jobFilter);
+    }
+    PathFilter inputFilter = new MultiPathFilter(filters);
+
     for (Path p: dirs) {
       FileSystem fs = p.getFileSystem(job); 
       Path[] matches =
-        fs.listPaths(fs.globPaths(p, hiddenFileFilter), hiddenFileFilter);
+        fs.listPaths(fs.globPaths(p, inputFilter), inputFilter);
       for (Path match: matches) {
         result.add(fs.makeQualified(match));
       }