@@ -50,10 +50,6 @@ import org.apache.hadoop.util.StringUtils;
 public class TeraInputFormat extends FileInputFormat<Text,Text> {
 
   static final String PARTITION_FILENAME = "_partition.lst";
-  private static final String NUM_PARTITIONS =
-      "mapreduce.terasort.num.partitions";
-  private static final String SAMPLE_SIZE =
-      "mapreduce.terasort.partitions.sample";
   static final int KEY_LENGTH = 10;
   static final int VALUE_LENGTH = 90;
   static final int RECORD_LENGTH = KEY_LENGTH + VALUE_LENGTH;
@@ -123,11 +119,16 @@ public class TeraInputFormat extends FileInputFormat<Text,Text> {
     final TeraInputFormat inFormat = new TeraInputFormat();
     final TextSampler sampler = new TextSampler();
     int partitions = job.getNumReduceTasks();
-    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
+    long sampleSize =
+        conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(),
+            TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
     final List<InputSplit> splits = inFormat.getSplits(job);
     long t2 = System.currentTimeMillis();
     System.out.println("Computing input splits took " + (t2 - t1) + "ms");
-    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
+    int samples =
+        Math.min(conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(),
+            TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS),
+            splits.size());
     System.out.println("Sampling " + samples + " splits of " + splits.size());
     final long recordsPerSample = sampleSize / samples;
     final int sampleStep = splits.size() / samples;
@@ -294,7 +295,8 @@ public class TeraInputFormat extends FileInputFormat<Text,Text> {
     lastResult = super.getSplits(job);
     t2 = System.currentTimeMillis();
     System.out.println("Spent " + (t2 - t1) + "ms computing base-splits.");
-    if (job.getConfiguration().getBoolean(TeraScheduler.USE, true)) {
+    if (job.getConfiguration().getBoolean(TeraSortConfigKeys.USE_TERA_SCHEDULER.key(),
+        TeraSortConfigKeys.DEFAULT_USE_TERA_SCHEDULER)) {
       TeraScheduler scheduler = new TeraScheduler(
           lastResult.toArray(new FileSplit[0]), job.getConfiguration());
       lastResult = scheduler.getNewFileSplits();
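
Note: the replacement calls above assume a TeraSortConfigKeys enum that exposes each configuration key string through key() and pairs it with a DEFAULT_* constant. A minimal sketch of that shape follows. The NUM_PARTITIONS and SAMPLE_SIZE key strings and the default values (10 and 100000) are taken from the lines removed above; the USE_TERA_SCHEDULER key string is an assumption, not confirmed by this patch.

// Sketch only: the enum shape implied by the calls in this patch, not the
// actual TeraSortConfigKeys source.
public enum TeraSortConfigKeys {
  NUM_PARTITIONS("mapreduce.terasort.num.partitions"),
  SAMPLE_SIZE("mapreduce.terasort.partitions.sample"),
  USE_TERA_SCHEDULER("mapreduce.terasort.use.terascheduler");  // key name assumed

  private final String confName;

  TeraSortConfigKeys(String confName) {
    this.confName = confName;
  }

  // Returns the Configuration key string for this setting.
  public String key() {
    return confName;
  }

  // Defaults mirror the literal fallbacks replaced in the diff above.
  public static final int DEFAULT_NUM_PARTITIONS = 10;
  public static final long DEFAULT_SAMPLE_SIZE = 100000L;
  public static final boolean DEFAULT_USE_TERA_SCHEDULER = true;
}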