
MAPREDUCE-3310. Custom grouping comparator cannot be set for Combiners (tucu)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1@1555972 13f79535-47bb-0310-9956-ffa450edef68
Alejandro Abdelnur 11 years ago
parent
commit
b2199a360e

+ 3 - 0
CHANGES.txt

@@ -49,6 +49,9 @@ Release 1.3.0 - unreleased
     MAPREDUCE-5609. Add debug log message when sending job end notification. 
     (rkanter via tucu)
 
+    MAPREDUCE-3310. Custom grouping comparator cannot be set for Combiners 
+    (tucu)
+
   BUG FIXES
 
     HADOOP-9863. Backport HADOOP-8686 to support BigEndian on ppc64. 

+ 52 - 5
src/mapred/org/apache/hadoop/mapred/JobConf.java

@@ -21,10 +21,6 @@ package org.apache.hadoop.mapred;
 
 import java.io.IOException;
 
-import java.net.URL;
-import java.net.URLDecoder;
-import java.util.Enumeration;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.filecache.DistributedCache;
@@ -883,6 +879,24 @@ public class JobConf extends Configuration {
     return get("mapred.text.key.partitioner.options");
   }
 
+  /**
+   * Get the user defined {@link WritableComparable} comparator for
+   * grouping keys of inputs to the combiner.
+   *
+   * @return comparator set by the user for grouping values.
+   * @see #setCombinerKeyGroupingComparator(Class) for details.
+   */
+  public RawComparator getCombinerKeyGroupingComparator() {
+    Class<? extends RawComparator> theClass = getClass(
+        "mapred.combiner.groupfn.class", null, RawComparator.class);
+    if (theClass == null) {
+      return getOutputKeyComparator();
+    }
+
+    return ReflectionUtils.newInstance(theClass, this);
+  }
+
+
   /** 
    * Get the user defined {@link WritableComparable} comparator for 
    * grouping keys of inputs to the reduce.
@@ -900,6 +914,37 @@ public class JobConf extends Configuration {
     return ReflectionUtils.newInstance(theClass, this);
   }
 
+
+  /**
+   * Set the user defined {@link RawComparator} comparator for
+   * grouping keys in the input to the combiner.
+   * <p/>
+   * <p>This comparator should be provided if the equivalence rules for keys
+   * for sorting the intermediates are different from those for grouping keys
+   * before each call to
+   * {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.</p>
+   * <p/>
+   * <p>For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+   * in a single call to the reduce function if K1 and K2 compare as equal.</p>
+   * <p/>
+   * <p>Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+   * how keys are sorted, this can be used in conjunction to simulate
+   * <i>secondary sort on values</i>.</p>
+   * <p/>
+   * <p><i>Note</i>: This is not a guarantee of the combiner sort being
+   * <i>stable</i> in any sense. (In any case, with the order of available
+   * map-outputs to the combiner being non-deterministic, it wouldn't make
+   * that much sense.)</p>
+   *
+   * @param theClass the comparator class to be used for grouping keys for the
+   * combiner. It should implement <code>RawComparator</code>.
+   * @see #setOutputKeyComparatorClass(Class)
+   */
+  public void setCombinerKeyGroupingComparator(
+      Class<? extends RawComparator> theClass) {
+    setClass("mapred.combiner.groupfn.class", theClass, RawComparator.class);
+  }
+
   /** 
    * Set the user defined {@link RawComparator} comparator for 
    * grouping keys in the input to the reduce.
@@ -923,7 +968,9 @@ public class JobConf extends Configuration {
    * 
    * @param theClass the comparator class to be used for grouping keys. 
    *                 It should implement <code>RawComparator</code>.
-   * @see #setOutputKeyComparatorClass(Class)                 
+   * @see #setOutputKeyComparatorClass(Class)
+   * @see #setCombinerKeyGroupingComparator(Class) for setting a
+   * comparator for the combiner.
    */
   public void setOutputValueGroupingComparator(
 		  Class<? extends RawComparator> theClass) {

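For context, a minimal old-API usage sketch of the new setter (MyCombiner and
MyGroupComparator are hypothetical classes, analogous to Combiner and
GroupComparator in the tests below; they are not part of this patch):

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.JobConf;

    JobConf conf = new JobConf();
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(LongWritable.class);
    conf.setCombinerClass(MyCombiner.class);
    // New in this patch: group combiner inputs with a custom comparator,
    // decoupled from the sort comparator.
    conf.setCombinerKeyGroupingComparator(MyGroupComparator.class);
    // If left unset, getCombinerKeyGroupingComparator() falls back to
    // getOutputKeyComparator(), so existing jobs behave as before.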
+ 3 - 3
src/mapred/org/apache/hadoop/mapred/Task.java

@@ -56,7 +56,6 @@ import org.apache.hadoop.util.ReflectionUtils;
 import org.apache.hadoop.util.ResourceCalculatorPlugin;
 import org.apache.hadoop.util.ResourceCalculatorPlugin.ProcResourceValues;
 import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.fs.FSDataInputStream;
 
 /** 
  * Base class for tasks.
@@ -1425,7 +1424,8 @@ abstract public class Task implements Writable, Configurable {
       combinerClass = cls;
       keyClass = (Class<K>) job.getMapOutputKeyClass();
       valueClass = (Class<V>) job.getMapOutputValueClass();
-      comparator = (RawComparator<K>) job.getOutputKeyComparator();
+      comparator = (RawComparator<K>)
+          job.getCombinerKeyGroupingComparator();
     }
 
     @SuppressWarnings("unchecked")
@@ -1472,7 +1472,7 @@ abstract public class Task implements Writable, Configurable {
       this.taskId = taskId;
       keyClass = (Class<K>) context.getMapOutputKeyClass();
       valueClass = (Class<V>) context.getMapOutputValueClass();
-      comparator = (RawComparator<K>) context.getSortComparator();
+      comparator = (RawComparator<K>) context.getCombinerKeyGroupingComparator();
       this.committer = committer;
     }
 

+ 17 - 1
src/mapred/org/apache/hadoop/mapreduce/Job.java

@@ -24,7 +24,6 @@ import java.security.PrivilegedExceptionAction;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.RawComparator;
-import org.apache.hadoop.mapreduce.TaskAttemptID;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.RunningJob;
@@ -288,6 +287,21 @@ public class Job extends JobContext {
     conf.setOutputKeyComparatorClass(cls);
   }
 
+  /**
+   * Define the comparator that controls which keys are grouped together
+   * for a single call to the combiner's
+   * {@link Reducer#reduce(Object, Iterable,
+   * org.apache.hadoop.mapreduce.Reducer.Context)}
+   *
+   * @param cls the raw comparator to use
+   * @throws IllegalStateException if the job is submitted
+   */
+  public void setCombinerKeyGroupingComparatorClass(
+      Class<? extends RawComparator> cls) throws IllegalStateException {
+    ensureState(JobState.DEFINE);
+    conf.setCombinerKeyGroupingComparator(cls);
+  }
+
   /**
    * Define the comparator that controls which keys are grouped together
    * for a single call to 
@@ -295,6 +309,8 @@ public class Job extends JobContext {
    *                       org.apache.hadoop.mapreduce.Reducer.Context)}
    * @param cls the raw comparator to use
    * @throws IllegalStateException if the job is submitted
+   * @see #setCombinerKeyGroupingComparatorClass(Class) for setting a
+   * comparator for the combiner.
    */
   public void setGroupingComparatorClass(Class<? extends RawComparator> cls
                                          ) throws IllegalStateException {

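The corresponding sketch for the new (org.apache.hadoop.mapreduce) API, again
with hypothetical MyCombiner and MyGroupComparator classes:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;

    Job job = new Job(new Configuration());
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setCombinerClass(MyCombiner.class);
    // Must be called while the job is still being defined; once the job
    // has been submitted this throws IllegalStateException.
    job.setCombinerKeyGroupingComparatorClass(MyGroupComparator.class);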
+ 13 - 1
src/mapred/org/apache/hadoop/mapreduce/JobContext.java

@@ -23,7 +23,6 @@ import java.io.IOException;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.RawComparator;
-import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
@@ -268,12 +267,25 @@ public class JobContext {
     return conf.getJar();
   }
 
+  /**
+   * Get the user defined {@link RawComparator} comparator for
+   * grouping keys of inputs to the combiner.
+   *
+   * @return comparator set by the user for grouping values.
+   * @see Job#setCombinerKeyGroupingComparatorClass(Class) for details.
+   */
+  public RawComparator<?> getCombinerKeyGroupingComparator() {
+    return conf.getCombinerKeyGroupingComparator();
+  }
+
   /** 
    * Get the user defined {@link RawComparator} comparator for 
    * grouping keys of inputs to the reduce.
    * 
    * @return comparator set by the user for grouping values.
    * @see Job#setGroupingComparatorClass(Class) for details.  
+   * @see #getCombinerKeyGroupingComparator() for the comparator used for
+   * grouping keys for the combiner.
    */
   public RawComparator<?> getGroupingComparator() {
     return conf.getOutputValueGroupingComparator();

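Per the JobConf change above, the new getter is backward compatible: with
nothing configured under mapred.combiner.groupfn.class it returns the job's
sort comparator. A minimal sketch, assuming a default configuration:

    import org.apache.hadoop.io.RawComparator;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.JobConf;

    JobConf conf = new JobConf();
    conf.setMapOutputKeyClass(Text.class);
    // Nothing set for combiner grouping, so this falls back to the
    // sort comparator for Text keys.
    RawComparator comparator = conf.getCombinerKeyGroupingComparator();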
+ 189 - 0
src/test/org/apache/hadoop/mapred/TestOldCombinerGrouping.java

@@ -0,0 +1,189 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+import junit.framework.Assert;
+import junit.framework.TestCase;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.Text;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.UUID;
+
+public class TestOldCombinerGrouping extends TestCase {
+  private static String TEST_ROOT_DIR =
+      new File("build", UUID.randomUUID().toString()).getAbsolutePath();
+
+  public static class Map implements
+      Mapper<LongWritable, Text, Text, LongWritable> {
+    @Override
+    public void map(LongWritable key, Text value,
+        OutputCollector<Text, LongWritable> output, Reporter reporter)
+        throws IOException {
+      String v = value.toString();
+      String k = v.substring(0, v.indexOf(","));
+      v = v.substring(v.indexOf(",") + 1);
+      output.collect(new Text(k), new LongWritable(Long.parseLong(v)));
+    }
+
+    @Override
+    public void close() throws IOException {
+    }
+
+    @Override
+    public void configure(JobConf job) {
+    }
+  }
+
+  public static class Reduce implements
+      Reducer<Text, LongWritable, Text, LongWritable> {
+
+    @Override
+    public void reduce(Text key, Iterator<LongWritable> values,
+        OutputCollector<Text, LongWritable> output, Reporter reporter)
+        throws IOException {
+      LongWritable maxValue = null;
+      while (values.hasNext()) {
+        LongWritable value = values.next();
+        if (maxValue == null) {
+          maxValue = value;
+        } else if (value.compareTo(maxValue) > 0) {
+          maxValue = value;
+        }
+      }
+      output.collect(key, maxValue);
+    }
+
+    @Override
+    public void close() throws IOException {
+    }
+
+    @Override
+    public void configure(JobConf job) {
+    }
+  }
+
+  public static class Combiner extends Reduce {
+  }
+
+  public static class GroupComparator implements RawComparator<Text> {
+    @Override
+    public int compare(byte[] bytes, int i, int i2, byte[] bytes2, int i3,
+        int i4) {
+      byte[] b1 = new byte[i2];
+      System.arraycopy(bytes, i, b1, 0, i2);
+
+      byte[] b2 = new byte[i4];
+      System.arraycopy(bytes2, i3, b2, 0, i4);
+
+      return compare(new Text(new String(b1)), new Text(new String(b2)));
+    }
+
+    @Override
+    public int compare(Text o1, Text o2) {
+      String s1 = o1.toString();
+      String s2 = o2.toString();
+      s1 = s1.substring(0, s1.indexOf("|"));
+      s2 = s2.substring(0, s2.indexOf("|"));
+      return s1.compareTo(s2);
+    }
+
+  }
+
+  public void testCombiner() throws Exception {
+    if (!new File(TEST_ROOT_DIR).mkdirs()) {
+      throw new RuntimeException("Could not create test dir: " + TEST_ROOT_DIR);
+    }
+    File in = new File(TEST_ROOT_DIR, "input");
+    if (!in.mkdirs()) {
+      throw new RuntimeException("Could not create test dir: " + in);
+    }
+    File out = new File(TEST_ROOT_DIR, "output");
+    PrintWriter pw = new PrintWriter(new FileWriter(new File(in, "data.txt")));
+    pw.println("A|a,1");
+    pw.println("A|b,2");
+    pw.println("B|a,3");
+    pw.println("B|b,4");
+    pw.println("B|c,5");
+    pw.close();
+    JobConf job = new JobConf();
+    TextInputFormat.setInputPaths(job, new Path(in.getPath()));
+    TextOutputFormat.setOutputPath(job, new Path(out.getPath()));
+    job.setMapperClass(Map.class);
+    job.setReducerClass(Reduce.class);
+    job.setInputFormat(TextInputFormat.class);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(LongWritable.class);
+    job.setOutputFormat(TextOutputFormat.class);
+    job.setOutputValueGroupingComparator(GroupComparator.class);
+
+    job.setCombinerClass(Combiner.class);
+    job.setCombinerKeyGroupingComparator(GroupComparator.class);
+    job.setInt("min.num.spills.for.combine", 0);
+
+    JobClient client = new JobClient(job);
+    RunningJob runningJob = client.submitJob(job);
+    runningJob.waitForCompletion();
+    if (runningJob.isSuccessful()) {
+      Counters counters = runningJob.getCounters();
+
+      long combinerInputRecords = counters.getGroup(
+          "org.apache.hadoop.mapred.Task$Counter").
+          getCounterForName("COMBINE_INPUT_RECORDS").getValue();
+      long combinerOutputRecords = counters.getGroup(
+          "org.apache.hadoop.mapred.Task$Counter").
+          getCounterForName("COMBINE_OUTPUT_RECORDS").getValue();
+      Assert.assertTrue(combinerInputRecords > 0);
+      Assert.assertTrue(combinerInputRecords > combinerOutputRecords);
+
+      BufferedReader br = new BufferedReader(new FileReader(
+          new File(out, "part-00000")));
+      Set<String> output = new HashSet<String>();
+      String line = br.readLine();
+      Assert.assertNotNull(line);
+      output.add(line.substring(0, 1) + line.substring(4, 5));
+      line = br.readLine();
+      Assert.assertNotNull(line);
+      output.add(line.substring(0, 1) + line.substring(4, 5));
+      line = br.readLine();
+      Assert.assertNull(line);
+      br.close();
+
+      Set<String> expected = new HashSet<String>();
+      expected.add("A2");
+      expected.add("B5");
+
+      Assert.assertEquals(expected, output);
+
+    } else {
+      Assert.fail("Job failed");
+    }
+  }
+
+}

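For reference, the expected output in this test follows from grouping on the
key prefix before '|': values 1 and 2 for the A|* keys and 3, 4, 5 for the
B|* keys, so the max-value reduce emits one line per group, e.g. "A|a<TAB>2"
and "B|a<TAB>5"; characters 0 and 4 of each line yield the "A2" and "B5"
entries asserted above. The new-API test below exercises the same scenario
through Job and JobContext.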
+ 174 - 0
src/test/org/apache/hadoop/mapreduce/TestNewCombinerGrouping.java

@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce;
+
+import junit.framework.Assert;
+import junit.framework.TestCase;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.UUID;
+
+public class TestNewCombinerGrouping extends TestCase {
+  private static String TEST_ROOT_DIR =
+      new File("build", UUID.randomUUID().toString()).getAbsolutePath();
+
+  public static class Map extends
+      Mapper<LongWritable, Text, Text, LongWritable> {
+
+    @Override
+    protected void map(LongWritable key, Text value,
+        Context context)
+        throws IOException, InterruptedException {
+      String v = value.toString();
+      String k = v.substring(0, v.indexOf(","));
+      v = v.substring(v.indexOf(",") + 1);
+      context.write(new Text(k), new LongWritable(Long.parseLong(v)));
+    }
+  }
+
+  public static class Reduce extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+
+    @Override
+    protected void reduce(Text key, Iterable<LongWritable> values,
+        Context context)
+        throws IOException, InterruptedException {
+      LongWritable maxValue = null;
+      for (LongWritable value : values) {
+        if (maxValue == null) {
+          maxValue = value;
+        } else if (value.compareTo(maxValue) > 0) {
+          maxValue = value;
+        }
+      }
+      context.write(key, maxValue);
+    }
+  }
+
+  public static class Combiner extends Reduce {
+  }
+
+  public static class GroupComparator implements RawComparator<Text> {
+    @Override
+    public int compare(byte[] bytes, int i, int i2, byte[] bytes2, int i3,
+        int i4) {
+      byte[] b1 = new byte[i2];
+      System.arraycopy(bytes, i, b1, 0, i2);
+
+      byte[] b2 = new byte[i4];
+      System.arraycopy(bytes2, i3, b2, 0, i4);
+
+      return compare(new Text(new String(b1)), new Text(new String(b2)));
+    }
+
+    @Override
+    public int compare(Text o1, Text o2) {
+      String s1 = o1.toString();
+      String s2 = o2.toString();
+      s1 = s1.substring(0, s1.indexOf("|"));
+      s2 = s2.substring(0, s2.indexOf("|"));
+      return s1.compareTo(s2);
+    }
+
+  }
+
+  public void testCombiner() throws Exception {
+    if (!new File(TEST_ROOT_DIR).mkdirs()) {
+      throw new RuntimeException("Could not create test dir: " + TEST_ROOT_DIR);
+    }
+    File in = new File(TEST_ROOT_DIR, "input");
+    if (!in.mkdirs()) {
+      throw new RuntimeException("Could not create test dir: " + in);
+    }
+    File out = new File(TEST_ROOT_DIR, "output");
+    PrintWriter pw = new PrintWriter(new FileWriter(new File(in, "data.txt")));
+    pw.println("A|a,1");
+    pw.println("A|b,2");
+    pw.println("B|a,3");
+    pw.println("B|b,4");
+    pw.println("B|c,5");
+    pw.close();
+    Job job = new Job();
+    TextInputFormat.setInputPaths(job, new Path(in.getPath()));
+    TextOutputFormat.setOutputPath(job, new Path(out.getPath()));
+
+    job.setMapperClass(Map.class);
+    job.setReducerClass(Reduce.class);
+    job.setInputFormatClass(TextInputFormat.class);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(LongWritable.class);
+    job.setOutputFormatClass(TextOutputFormat.class);
+    job.setGroupingComparatorClass(GroupComparator.class);
+
+    job.setCombinerKeyGroupingComparatorClass(GroupComparator.class);
+    job.setCombinerClass(Combiner.class);
+    job.getConfiguration().setInt("min.num.spills.for.combine", 0);
+
+    job.submit();
+    job.waitForCompletion(false);
+    if (job.isSuccessful()) {
+      Counters counters = job.getCounters();
+
+      long combinerInputRecords = counters.findCounter(
+          "org.apache.hadoop.mapred.Task$Counter",
+          "COMBINE_INPUT_RECORDS").getValue();
+      long combinerOutputRecords = counters.findCounter(
+          "org.apache.hadoop.mapred.Task$Counter",
+          "COMBINE_OUTPUT_RECORDS").getValue();
+      Assert.assertTrue(combinerInputRecords > 0);
+      Assert.assertTrue(combinerInputRecords > combinerOutputRecords);
+
+      BufferedReader br = new BufferedReader(new FileReader(
+          new File(out, "part-r-00000")));
+      Set<String> output = new HashSet<String>();
+      String line = br.readLine();
+      Assert.assertNotNull(line);
+      output.add(line.substring(0, 1) + line.substring(4, 5));
+      line = br.readLine();
+      Assert.assertNotNull(line);
+      output.add(line.substring(0, 1) + line.substring(4, 5));
+      line = br.readLine();
+      Assert.assertNull(line);
+      br.close();
+
+      Set<String> expected = new HashSet<String>();
+      expected.add("A2");
+      expected.add("B5");
+
+      Assert.assertEquals(expected, output);
+
+    } else {
+      Assert.fail("Job failed");
+    }
+  }
+
+}