14 years ago · 1c01951264
--- a/hadoop-mapreduce-project/CHANGES.txt
+++ b/hadoop-mapreduce-project/CHANGES.txt
@@ -2,6 +2,13 @@ Hadoop MapReduce Change Log
 
				 
			
 
				 Trunk (unreleased changes)
			
 
				 
			
 
				+  INCOMPATIBLE CHANGES
			
 
				+
			
 
				+  NEW FEATURES
			
 
				+
			
 
				+    MAPREDUCE-2669. Add new examples for Mean, Median, and Standard Deviation.
			
 
				+    (Plamen Jeliazkov via shv)
			
 
				+
			
 
				   IMPROVEMENTS
			
 
				 
			
 
				     MAPREDUCE-2887 due to HADOOP-7524 Change RPC to allow multiple protocols
			
--- a/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordMean.java
+++ b/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordMean.java
@@ -0,0 +1,196 @@
 
				+package org.apache.hadoop.examples;

			
 
				+

			
 
				+/**

			
 
				+ * Licensed to the Apache Software Foundation (ASF) under one

			
 
				+ * or more contributor license agreements.  See the NOTICE file

			
 
				+ * distributed with this work for additional information

			
 
				+ * regarding copyright ownership.  The ASF licenses this file

			
 
				+ * to you under the Apache License, Version 2.0 (the

			
 
				+ * "License"); you may not use this file except in compliance

			
 
				+ * with the License.  You may obtain a copy of the License at

			
 
				+ *

			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0

			
 
				+ *

			
 
				+ * Unless required by applicable law or agreed to in writing, software

			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,

			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

			
 
				+ * See the License for the specific language governing permissions and

			
 
				+ * limitations under the License.

			
 
				+ */

			
 
				+

			
 
				+import java.io.BufferedReader;

			
 
				+import java.io.IOException;

			
 
				+import java.io.InputStreamReader;

			
 
				+import java.util.StringTokenizer;

			
 
				+

			
 
				+import org.apache.hadoop.conf.Configuration;

			
 
				+import org.apache.hadoop.conf.Configured;

			
 
				+import org.apache.hadoop.fs.FileSystem;

			
 
				+import org.apache.hadoop.fs.Path;

			
 
				+import org.apache.hadoop.io.LongWritable;

			
 
				+import org.apache.hadoop.io.Text;

			
 
				+import org.apache.hadoop.mapreduce.Job;

			
 
				+import org.apache.hadoop.mapreduce.Mapper;

			
 
				+import org.apache.hadoop.mapreduce.Reducer;

			
 
				+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

			
 
				+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

			
 
				+import org.apache.hadoop.util.Tool;

			
 
				+import org.apache.hadoop.util.ToolRunner;

			
 
				+

			
 
				+public class WordMean extends Configured implements Tool {

			
 
				+

			
 
				+  private double mean = 0;

			
 
				+

			
 
				+  private final static Text COUNT = new Text("count");

			
 
				+  private final static Text LENGTH = new Text("length");

			
 
				+  private final static LongWritable ONE = new LongWritable(1);

			
 
				+

			
 
				+  /**

			
 
				+   * Maps words from line of text into 2 key-value pairs; one key-value pair for

			
 
				+   * counting the word, another for counting its length.

			
 
				+   */

			
 
				+  public static class WordMeanMapper extends

			
 
				+      Mapper<Object, Text, Text, LongWritable> {

			
 
				+

			
 
				+    private LongWritable wordLen = new LongWritable();

			
 
				+

			
 
				+    /**

			
 
				+     * Emits 2 key-value pairs for counting the word and its length. Outputs are

			
 
				+     * (Text, LongWritable).

			
 
				+     * 

			
 
				+     * @param value

			
 
				+     *          This will be a line of text coming in from our input file.

			
 
				+     */

			
 
				+    public void map(Object key, Text value, Context context)

			
 
				+        throws IOException, InterruptedException {

			
 
				+      StringTokenizer itr = new StringTokenizer(value.toString());

			
 
				+      while (itr.hasMoreTokens()) {

			
 
				+        String string = itr.nextToken();

			
 
				+        this.wordLen.set(string.length());

			
 
				+        context.write(LENGTH, this.wordLen);

			
 
				+        context.write(COUNT, ONE);

			
 
				+      }

			
 
				+    }

			
 
				+  }

			
 
				+

			
 
				+  /**

			
 
				+   * Performs integer summation of all the values for each key.

			
 
				+   */

			
 
				+  public static class WordMeanReducer extends

			
 
				+      Reducer<Text, LongWritable, Text, LongWritable> {

			
 
				+

			
 
				+    private LongWritable sum = new LongWritable();

			
 
				+

			
 
				+    /**

			
 
				+     * Sums all the individual values within the iterator and writes them to the

			
 
				+     * same key.

			
 
				+     * 

			
 
				+     * @param key

			
 
				+     *          This will be one of 2 constants: LENGTH_STR or COUNT_STR.

			
 
				+     * @param values

			
 
				+     *          This will be an iterator of all the values associated with that

			
 
				+     *          key.

			
 
				+     */

			
 
				+    public void reduce(Text key, Iterable<LongWritable> values, Context context)

			
 
				+        throws IOException, InterruptedException {

			
 
				+

			
 
				+      int theSum = 0;

			
 
				+      for (LongWritable val : values) {

			
 
				+        theSum += val.get();

			
 
				+      }

			
 
				+      sum.set(theSum);

			
 
				+      context.write(key, sum);

			
 
				+    }

			
 
				+  }

			
 
				+

			
 
				+  /**

			
 
				+   * Reads the output file and parses the summation of lengths, and the word

			
 
				+   * count, to perform a quick calculation of the mean.

			
 
				+   * 

			
 
				+   * @param path

			
 
				+   *          The path to find the output file in. Set in main to the output

			
 
				+   *          directory.

			
 
				+   * @throws IOException

			
 
				+   *           If it cannot access the output directory, we throw an exception.

			
 
				+   */

			
 
				+  private double readAndCalcMean(Path path, Configuration conf)

			
 
				+      throws IOException {

			
 
				+    FileSystem fs = FileSystem.get(conf);

			
 
				+    Path file = new Path(path, "part-r-00000");

			
 
				+

			
 
				+    if (!fs.exists(file))

			
 
				+      throw new IOException("Output not found!");

			
 
				+

			
 
				+    BufferedReader br = null;

			
 
				+

			
 
				+    // average = total sum / number of elements;

			
 
				+    try {

			
 
				+      br = new BufferedReader(new InputStreamReader(fs.open(file)));

			
 
				+

			
 
				+      long count = 0;

			
 
				+      long length = 0;

			
 
				+

			
 
				+      String line;

			
 
				+      while ((line = br.readLine()) != null) {

			
 
				+        StringTokenizer st = new StringTokenizer(line);

			
 
				+

			
 
				+        // grab type

			
 
				+        String type = st.nextToken();

			
 
				+

			
 
				+        // differentiate

			
 
				+        if (type.equals(COUNT.toString())) {

			
 
				+          String countLit = st.nextToken();

			
 
				+          count = Long.parseLong(countLit);

			
 
				+        } else if (type.equals(LENGTH.toString())) {

			
 
				+          String lengthLit = st.nextToken();

			
 
				+          length = Long.parseLong(lengthLit);

			
 
				+        }

			
 
				+      }

			
 
				+

			
 
				+      double theMean = (((double) length) / ((double) count));

			
 
				+      System.out.println("The mean is: " + theMean);

			
 
				+      return theMean;

			
 
				+    } finally {

			
 
				+      br.close();

			
 
				+    }

			
 
				+  }

			
 
				+

			
 
				+  public static void main(String[] args) throws Exception {

			
 
				+    ToolRunner.run(new Configuration(), new WordMean(), args);

			
 
				+  }

			
 
				+

			
 
				+  @Override

			
 
				+  public int run(String[] args) throws Exception {

			
 
				+    if (args.length != 2) {

			
 
				+      System.err.println("Usage: wordmean <in> <out>");

			
 
				+      return 0;

			
 
				+    }

			
 
				+

			
 
				+    Configuration conf = getConf();

			
 
				+

			
 
				+    @SuppressWarnings("deprecation")

			
 
				+    Job job = new Job(conf, "word mean");

			
 
				+    job.setJarByClass(WordMean.class);

			
 
				+    job.setMapperClass(WordMeanMapper.class);

			
 
				+    job.setCombinerClass(WordMeanReducer.class);

			
 
				+    job.setReducerClass(WordMeanReducer.class);

			
 
				+    job.setOutputKeyClass(Text.class);

			
 
				+    job.setOutputValueClass(LongWritable.class);

			
 
				+    FileInputFormat.addInputPath(job, new Path(args[0]));

			
 
				+    Path outputpath = new Path(args[1]);

			
 
				+    FileOutputFormat.setOutputPath(job, outputpath);

			
 
				+    boolean result = job.waitForCompletion(true);

			
 
				+    mean = readAndCalcMean(outputpath, conf);

			
 
				+

			
 
				+    return (result ? 0 : 1);

			
 
				+  }

			
 
				+

			
 
				+  /**

			
 
				+   * Only valuable after run() called.

			
 
				+   * 

			
 
				+   * @return Returns the mean value.

			
 
				+   */

			
 
				+  public double getMean() {

			
 
				+    return mean;

			
 
				+  }

			
 
				+}
			
--- a/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordMedian.java
+++ b/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordMedian.java
@@ -0,0 +1,208 @@
 
				+package org.apache.hadoop.examples;

			
 
				+

			
 
				+/**

			
 
				+ * Licensed to the Apache Software Foundation (ASF) under one

			
 
				+ * or more contributor license agreements.  See the NOTICE file

			
 
				+ * distributed with this work for additional information

			
 
				+ * regarding copyright ownership.  The ASF licenses this file

			
 
				+ * to you under the Apache License, Version 2.0 (the

			
 
				+ * "License"); you may not use this file except in compliance

			
 
				+ * with the License.  You may obtain a copy of the License at

			
 
				+ *

			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0

			
 
				+ *

			
 
				+ * Unless required by applicable law or agreed to in writing, software

			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,

			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

			
 
				+ * See the License for the specific language governing permissions and

			
 
				+ * limitations under the License.

			
 
				+ */

			
 
				+

			
 
				+import java.io.BufferedReader;

			
 
				+import java.io.IOException;

			
 
				+import java.io.InputStreamReader;

			
 
				+import java.util.StringTokenizer;

			
 
				+

			
 
				+import org.apache.hadoop.conf.Configuration;

			
 
				+import org.apache.hadoop.conf.Configured;

			
 
				+import org.apache.hadoop.fs.FileSystem;

			
 
				+import org.apache.hadoop.fs.Path;

			
 
				+import org.apache.hadoop.io.IntWritable;

			
 
				+import org.apache.hadoop.io.Text;

			
 
				+import org.apache.hadoop.mapreduce.Job;

			
 
				+import org.apache.hadoop.mapreduce.Mapper;

			
 
				+import org.apache.hadoop.mapreduce.Reducer;

			
 
				+import org.apache.hadoop.mapreduce.TaskCounter;

			
 
				+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

			
 
				+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

			
 
				+import org.apache.hadoop.util.Tool;

			
 
				+import org.apache.hadoop.util.ToolRunner;

			
 
				+

			
 
				+public class WordMedian extends Configured implements Tool {

			
 
				+

			
 
				+  private double median = 0;

			
 
				+  private final static IntWritable ONE = new IntWritable(1);

			
 
				+

			
 
				+  /**

			
 
				+   * Maps words from line of text into a key-value pair; the length of the word

			
 
				+   * as the key, and 1 as the value.

			
 
				+   */

			
 
				+  public static class WordMedianMapper extends

			
 
				+      Mapper<Object, Text, IntWritable, IntWritable> {

			
 
				+

			
 
				+    private IntWritable length = new IntWritable();

			
 
				+

			
 
				+    /**

			
 
				+     * Emits a key-value pair for counting the word. Outputs are (IntWritable,

			
 
				+     * IntWritable).

			
 
				+     * 

			
 
				+     * @param value

			
 
				+     *          This will be a line of text coming in from our input file.

			
 
				+     */

			
 
				+    public void map(Object key, Text value, Context context)

			
 
				+        throws IOException, InterruptedException {

			
 
				+      StringTokenizer itr = new StringTokenizer(value.toString());

			
 
				+      while (itr.hasMoreTokens()) {

			
 
				+        String string = itr.nextToken();

			
 
				+        length.set(string.length());

			
 
				+        context.write(length, ONE);

			
 
				+      }

			
 
				+    }

			
 
				+  }

			
 
				+

			
 
				+  /**

			
 
				+   * Performs integer summation of all the values for each key.

			
 
				+   */

			
 
				+  public static class WordMedianReducer extends

			
 
				+      Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

			
 
				+

			
 
				+    private IntWritable val = new IntWritable();

			
 
				+

			
 
				+    /**

			
 
				+     * Sums all the individual values within the iterator and writes them to the

			
 
				+     * same key.

			
 
				+     * 

			
 
				+     * @param key

			
 
				+     *          This will be a length of a word that was read.

			
 
				+     * @param values

			
 
				+     *          This will be an iterator of all the values associated with that

			
 
				+     *          key.

			
 
				+     */

			
 
				+    public void reduce(IntWritable key, Iterable<IntWritable> values,

			
 
				+        Context context) throws IOException, InterruptedException {

			
 
				+

			
 
				+      int sum = 0;

			
 
				+      for (IntWritable value : values) {

			
 
				+        sum += value.get();

			
 
				+      }

			
 
				+      val.set(sum);

			
 
				+      context.write(key, val);

			
 
				+    }

			
 
				+  }

			
 
				+

			
 
				+  /**

			
 
				+   * This is a standard program to read and find a median value based on a file

			
 
				+   * of word counts such as: 1 456, 2 132, 3 56... Where the first values are

			
 
				+   * the word lengths and the following values are the number of times that

			
 
				+   * words of that length appear.

			
 
				+   * 

			
 
				+   * @param path

			
 
				+   *          The path to read the HDFS file from (part-r-00000...00001...etc).

			
 
				+   * @param medianIndex1

			
 
				+   *          The first length value to look for.

			
 
				+   * @param medianIndex2

			
 
				+   *          The second length value to look for (will be the same as the first

			
 
				+   *          if there are an even number of words total).

			
 
				+   * @throws IOException

			
 
				+   *           If file cannot be found, we throw an exception.

			
 
				+   * */

			
 
				+  private double readAndFindMedian(String path, int medianIndex1,

			
 
				+      int medianIndex2, Configuration conf) throws IOException {

			
 
				+    FileSystem fs = FileSystem.get(conf);

			
 
				+    Path file = new Path(path, "part-r-00000");

			
 
				+

			
 
				+    if (!fs.exists(file))

			
 
				+      throw new IOException("Output not found!");

			
 
				+

			
 
				+    BufferedReader br = null;

			
 
				+

			
 
				+    try {

			
 
				+      br = new BufferedReader(new InputStreamReader(fs.open(file)));

			
 
				+      int num = 0;

			
 
				+

			
 
				+      String line;

			
 
				+      while ((line = br.readLine()) != null) {

			
 
				+        StringTokenizer st = new StringTokenizer(line);

			
 
				+

			
 
				+        // grab length

			
 
				+        String currLen = st.nextToken();

			
 
				+

			
 
				+        // grab count

			
 
				+        String lengthFreq = st.nextToken();

			
 
				+

			
 
				+        int prevNum = num;

			
 
				+        num += Integer.parseInt(lengthFreq);

			
 
				+

			
 
				+        if (medianIndex2 >= prevNum && medianIndex1 <= num) {

			
 
				+          System.out.println("The median is: " + currLen);

			
 
				+          br.close();

			
 
				+          return Double.parseDouble(currLen);

			
 
				+        } else if (medianIndex2 >= prevNum && medianIndex1 < num) {

			
 
				+          String nextCurrLen = st.nextToken();

			
 
				+          double theMedian = (Integer.parseInt(currLen) + Integer

			
 
				+              .parseInt(nextCurrLen)) / 2.0;

			
 
				+          System.out.println("The median is: " + theMedian);

			
 
				+          br.close();

			
 
				+          return theMedian;

			
 
				+        }

			
 
				+      }

			
 
				+    } finally {

			
 
				+      br.close();

			
 
				+    }

			
 
				+    // error, no median found

			
 
				+    return -1;

			
 
				+  }

			
 
				+

			
 
				+  public static void main(String[] args) throws Exception {

			
 
				+    ToolRunner.run(new Configuration(), new WordMedian(), args);

			
 
				+  }

			
 
				+

			
 
				+  @Override

			
 
				+  public int run(String[] args) throws Exception {

			
 
				+    if (args.length != 2) {

			
 
				+      System.err.println("Usage: wordmedian <in> <out>");

			
 
				+      return 0;

			
 
				+    }

			
 
				+

			
 
				+    setConf(new Configuration());

			
 
				+    Configuration conf = getConf();

			
 
				+

			
 
				+    @SuppressWarnings("deprecation")

			
 
				+    Job job = new Job(conf, "word median");

			
 
				+    job.setJarByClass(WordMedian.class);

			
 
				+    job.setMapperClass(WordMedianMapper.class);

			
 
				+    job.setCombinerClass(WordMedianReducer.class);

			
 
				+    job.setReducerClass(WordMedianReducer.class);

			
 
				+    job.setOutputKeyClass(IntWritable.class);

			
 
				+    job.setOutputValueClass(IntWritable.class);

			
 
				+    FileInputFormat.addInputPath(job, new Path(args[0]));

			
 
				+    FileOutputFormat.setOutputPath(job, new Path(args[1]));

			
 
				+    boolean result = job.waitForCompletion(true);

			
 
				+

			
 
				+    // Wait for JOB 1 -- get middle value to check for Median

			
 
				+

			
 
				+    long totalWords = job.getCounters()

			
 
				+        .getGroup(TaskCounter.class.getCanonicalName())

			
 
				+        .findCounter("MAP_OUTPUT_RECORDS", "Map output records").getValue();

			
 
				+    int medianIndex1 = (int) Math.ceil((totalWords / 2.0));

			
 
				+    int medianIndex2 = (int) Math.floor((totalWords / 2.0));

			
 
				+

			
 
				+    median = readAndFindMedian(args[1], medianIndex1, medianIndex2, conf);

			
 
				+

			
 
				+    return (result ? 0 : 1);

			
 
				+  }

			
 
				+

			
 
				+  public double getMedian() {

			
 
				+    return median;

			
 
				+  }

			
 
				+}

			
--- a/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordStandardDeviation.java
+++ b/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordStandardDeviation.java
@@ -0,0 +1,210 @@
 
				+package org.apache.hadoop.examples;

			
 
				+

			
 
				+/**

			
 
				+ * Licensed to the Apache Software Foundation (ASF) under one

			
 
				+ * or more contributor license agreements.  See the NOTICE file

			
 
				+ * distributed with this work for additional information

			
 
				+ * regarding copyright ownership.  The ASF licenses this file

			
 
				+ * to you under the Apache License, Version 2.0 (the

			
 
				+ * "License"); you may not use this file except in compliance

			
 
				+ * with the License.  You may obtain a copy of the License at

			
 
				+ *

			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0

			
 
				+ *

			
 
				+ * Unless required by applicable law or agreed to in writing, software

			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,

			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

			
 
				+ * See the License for the specific language governing permissions and

			
 
				+ * limitations under the License.

			
 
				+ */

			
 
				+

			
 
				+import java.io.BufferedReader;

			
 
				+import java.io.IOException;

			
 
				+import java.io.InputStreamReader;

			
 
				+import java.util.StringTokenizer;

			
 
				+

			
 
				+import org.apache.hadoop.conf.Configuration;

			
 
				+import org.apache.hadoop.conf.Configured;

			
 
				+import org.apache.hadoop.fs.FileSystem;

			
 
				+import org.apache.hadoop.fs.Path;

			
 
				+import org.apache.hadoop.io.LongWritable;

			
 
				+import org.apache.hadoop.io.Text;

			
 
				+import org.apache.hadoop.mapreduce.Job;

			
 
				+import org.apache.hadoop.mapreduce.Mapper;

			
 
				+import org.apache.hadoop.mapreduce.Reducer;

			
 
				+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

			
 
				+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

			
 
				+import org.apache.hadoop.util.Tool;

			
 
				+import org.apache.hadoop.util.ToolRunner;

			
 
				+

			
 
				+public class WordStandardDeviation extends Configured implements Tool {

			
 
				+

			
 
				+  private double stddev = 0;

			
 
				+

			
 
				+  private final static Text LENGTH = new Text("length");

			
 
				+  private final static Text SQUARE = new Text("square");

			
 
				+  private final static Text COUNT = new Text("count");

			
 
				+  private final static LongWritable ONE = new LongWritable(1);

			
 
				+

			
 
				+  /**

			
 
				+   * Maps words from line of text into 3 key-value pairs; one key-value pair for

			
 
				+   * counting the word, one for counting its length, and one for counting the

			
 
				+   * square of its length.

			
 
				+   */

			
 
				+  public static class WordStandardDeviationMapper extends

			
 
				+      Mapper<Object, Text, Text, LongWritable> {

			
 
				+

			
 
				+    private LongWritable wordLen = new LongWritable();

			
 
				+    private LongWritable wordLenSq = new LongWritable();

			
 
				+

			
 
				+    /**

			
 
				+     * Emits 3 key-value pairs for counting the word, its length, and the

			
 
				+     * squares of its length. Outputs are (Text, LongWritable).

			
 
				+     * 

			
 
				+     * @param value

			
 
				+     *          This will be a line of text coming in from our input file.

			
 
				+     */

			
 
				+    public void map(Object key, Text value, Context context)

			
 
				+        throws IOException, InterruptedException {

			
 
				+      StringTokenizer itr = new StringTokenizer(value.toString());

			
 
				+      while (itr.hasMoreTokens()) {

			
 
				+        String string = itr.nextToken();

			
 
				+

			
 
				+        this.wordLen.set(string.length());

			
 
				+

			
 
				+        // the square of an integer is an integer...

			
 
				+        this.wordLenSq.set((long) Math.pow(string.length(), 2.0));

			
 
				+

			
 
				+        context.write(LENGTH, this.wordLen);

			
 
				+        context.write(SQUARE, this.wordLenSq);

			
 
				+        context.write(COUNT, ONE);

			
 
				+      }

			
 
				+    }

			
 
				+  }

			
 
				+

			
 
				+  /**

			
 
				+   * Performs integer summation of all the values for each key.

			
 
				+   */

			
 
				+  public static class WordStandardDeviationReducer extends

			
 
				+      Reducer<Text, LongWritable, Text, LongWritable> {

			
 
				+

			
 
				+    private LongWritable val = new LongWritable();

			
 
				+

			
 
				+    /**

			
 
				+     * Sums all the individual values within the iterator and writes them to the

			
 
				+     * same key.

			
 
				+     * 

			
 
				+     * @param key

			
 
				+     *          This will be one of 2 constants: LENGTH_STR, COUNT_STR, or

			
 
				+     *          SQUARE_STR.

			
 
				+     * @param values

			
 
				+     *          This will be an iterator of all the values associated with that

			
 
				+     *          key.

			
 
				+     */

			
 
				+    public void reduce(Text key, Iterable<LongWritable> values, Context context)

			
 
				+        throws IOException, InterruptedException {

			
 
				+

			
 
				+      int sum = 0;

			
 
				+      for (LongWritable value : values) {

			
 
				+        sum += value.get();

			
 
				+      }

			
 
				+      val.set(sum);

			
 
				+      context.write(key, val);

			
 
				+    }

			
 
				+  }

			
 
				+

			
 
				+  /**

			
 
				+   * Reads the output file and parses the summation of lengths, the word count,

			
 
				+   * and the lengths squared, to perform a quick calculation of the standard

			
 
				+   * deviation.

			
 
				+   * 

			
 
				+   * @param path

			
 
				+   *          The path to find the output file in. Set in main to the output

			
 
				+   *          directory.

			
 
				+   * @throws IOException

			
 
				+   *           If it cannot access the output directory, we throw an exception.

			
 
				+   */

			
 
				+  private double readAndCalcStdDev(Path path, Configuration conf)

			
 
				+      throws IOException {

			
 
				+    FileSystem fs = FileSystem.get(conf);

			
 
				+    Path file = new Path(path, "part-r-00000");

			
 
				+

			
 
				+    if (!fs.exists(file))

			
 
				+      throw new IOException("Output not found!");

			
 
				+

			
 
				+    double stddev = 0;

			
 
				+    BufferedReader br = null;

			
 
				+    try {

			
 
				+      br = new BufferedReader(new InputStreamReader(fs.open(file)));

			
 
				+      long count = 0;

			
 
				+      long length = 0;

			
 
				+      long square = 0;

			
 
				+      String line;

			
 
				+      while ((line = br.readLine()) != null) {

			
 
				+        StringTokenizer st = new StringTokenizer(line);

			
 
				+

			
 
				+        // grab type

			
 
				+        String type = st.nextToken();

			
 
				+

			
 
				+        // differentiate

			
 
				+        if (type.equals(COUNT.toString())) {

			
 
				+          String countLit = st.nextToken();

			
 
				+          count = Long.parseLong(countLit);

			
 
				+        } else if (type.equals(LENGTH.toString())) {

			
 
				+          String lengthLit = st.nextToken();

			
 
				+          length = Long.parseLong(lengthLit);

			
 
				+        } else if (type.equals(SQUARE.toString())) {

			
 
				+          String squareLit = st.nextToken();

			
 
				+          square = Long.parseLong(squareLit);

			
 
				+        }

			
 
				+      }

			
 
				+      // average = total sum / number of elements;

			
 
				+      double mean = (((double) length) / ((double) count));

			
 
				+      // standard deviation = sqrt((sum(lengths ^ 2)/count) - (mean ^ 2))

			
 
				+      mean = Math.pow(mean, 2.0);

			
 
				+      double term = (((double) square / ((double) count)));

			
 
				+      stddev = Math.sqrt((term - mean));

			
 
				+      System.out.println("The standard deviation is: " + stddev);

			
 
				+    } finally {

			
 
				+      br.close();

			
 
				+    }

			
 
				+    return stddev;

			
 
				+  }

			
 
				+

			
 
				+  public static void main(String[] args) throws Exception {

			
 
				+    ToolRunner.run(new Configuration(), new WordStandardDeviation(),

			
 
				+        args);

			
 
				+  }

			
 
				+

			
 
				+  @Override

			
 
				+  public int run(String[] args) throws Exception {

			
 
				+    if (args.length != 2) {

			
 
				+      System.err.println("Usage: wordstddev <in> <out>");

			
 
				+      return 0;

			
 
				+    }

			
 
				+

			
 
				+    Configuration conf = getConf();

			
 
				+

			
 
				+    @SuppressWarnings("deprecation")

			
 
				+    Job job = new Job(conf, "word stddev");

			
 
				+    job.setJarByClass(WordStandardDeviation.class);

			
 
				+    job.setMapperClass(WordStandardDeviationMapper.class);

			
 
				+    job.setCombinerClass(WordStandardDeviationReducer.class);

			
 
				+    job.setReducerClass(WordStandardDeviationReducer.class);

			
 
				+    job.setOutputKeyClass(Text.class);

			
 
				+    job.setOutputValueClass(LongWritable.class);

			
 
				+    FileInputFormat.addInputPath(job, new Path(args[0]));

			
 
				+    Path outputpath = new Path(args[1]);

			
 
				+    FileOutputFormat.setOutputPath(job, outputpath);

			
 
				+    boolean result = job.waitForCompletion(true);

			
 
				+

			
 
				+    // read output and calculate standard deviation

			
 
				+    stddev = readAndCalcStdDev(outputpath, conf);

			
 
				+

			
 
				+    return (result ? 0 : 1);

			
 
				+  }

			
 
				+

			
 
				+  public double getStandardDeviation() {

			
 
				+    return stddev;

			
 
				+  }

			
 
				+}
			
--- a/hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/examples/TestWordStats.java
+++ b/hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/examples/TestWordStats.java
@@ -0,0 +1,272 @@
 
				+package org.apache.hadoop.examples;

			
 
				+

			
 
				+import static org.junit.Assert.assertEquals;

			
 
				+

			
 
				+import java.io.BufferedReader;

			
 
				+import java.io.File;

			
 
				+import java.io.IOException;

			
 
				+import java.io.InputStreamReader;

			
 
				+import java.util.StringTokenizer;

			
 
				+import java.util.TreeMap;

			
 
				+

			
 
				+import org.apache.hadoop.conf.Configuration;

			
 
				+import org.apache.hadoop.fs.FileStatus;

			
 
				+import org.apache.hadoop.fs.FileSystem;

			
 
				+import org.apache.hadoop.fs.Path;

			
 
				+import org.apache.hadoop.util.ToolRunner;

			
 
				+import org.junit.Before;

			
 
				+import org.junit.Test;

			
 
				+

			
 
				+public class TestWordStats {

			
 
				+

			
 
				+  private final static String INPUT = "src/test/mapred/org/apache/hadoop/examples/pi/math";

			
 
				+  private final static String MEAN_OUTPUT = "build/data/mean_output";

			
 
				+  private final static String MEDIAN_OUTPUT = "build/data/median_output";

			
 
				+  private final static String STDDEV_OUTPUT = "build/data/stddev_output";

			
 
				+

			
 
				+  /**

			
 
				+   * Modified internal test class that is designed to read all the files in the

			
 
				+   * input directory, and find the standard deviation between all of the word

			
 
				+   * lengths.

			
 
				+   */

			
 
				+  public static class WordStdDevReader {

			
 
				+    private long wordsRead = 0;

			
 
				+    private long wordLengthsRead = 0;

			
 
				+    private long wordLengthsReadSquared = 0;

			
 
				+

			
 
				+    public WordStdDevReader() {

			
 
				+    }

			
 
				+

			
 
				+    public double read(String path) throws IOException {

			
 
				+      FileSystem fs = FileSystem.get(new Configuration());

			
 
				+      FileStatus[] files = fs.listStatus(new Path(path));

			
 
				+

			
 
				+      for (FileStatus fileStat : files) {

			
 
				+        if (!fileStat.isFile())

			
 
				+          continue;

			
 
				+

			
 
				+        BufferedReader br = null;

			
 
				+

			
 
				+        try {

			
 
				+          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));

			
 
				+

			
 
				+          String line;

			
 
				+          while ((line = br.readLine()) != null) {

			
 
				+            StringTokenizer st = new StringTokenizer(line);

			
 
				+            String word;

			
 
				+            while (st.hasMoreTokens()) {

			
 
				+              word = st.nextToken();

			
 
				+              this.wordsRead++;

			
 
				+              this.wordLengthsRead += word.length();

			
 
				+              this.wordLengthsReadSquared += (long) Math.pow(word.length(), 2.0);

			
 
				+            }

			
 
				+          }

			
 
				+

			
 
				+        } catch (IOException e) {

			
 
				+          System.out.println("Output could not be read!");

			
 
				+          throw e;

			
 
				+        } finally {

			
 
				+          br.close();

			
 
				+        }

			
 
				+      }

			
 
				+

			
 
				+      double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead));

			
 
				+      mean = Math.pow(mean, 2.0);

			
 
				+      double term = (((double) this.wordLengthsReadSquared / ((double) this.wordsRead)));

			
 
				+      double stddev = Math.sqrt((term - mean));

			
 
				+      return stddev;

			
 
				+    }

			
 
				+

			
 
				+  }

			
 
				+

			
 
				+  /**

			
 
				+   * Modified internal test class that is designed to read all the files in the

			
 
				+   * input directory, and find the median length of all the words.

			
 
				+   */

			
 
				+  public static class WordMedianReader {

			
 
				+    private long wordsRead = 0;

			
 
				+    private TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();

			
 
				+

			
 
				+    public WordMedianReader() {

			
 
				+    }

			
 
				+

			
 
				+    public double read(String path) throws IOException {

			
 
				+      FileSystem fs = FileSystem.get(new Configuration());

			
 
				+      FileStatus[] files = fs.listStatus(new Path(path));

			
 
				+

			
 
				+      int num = 0;

			
 
				+

			
 
				+      for (FileStatus fileStat : files) {

			
 
				+        if (!fileStat.isFile())

			
 
				+          continue;

			
 
				+

			
 
				+        BufferedReader br = null;

			
 
				+

			
 
				+        try {

			
 
				+          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));

			
 
				+

			
 
				+          String line;

			
 
				+          while ((line = br.readLine()) != null) {

			
 
				+            StringTokenizer st = new StringTokenizer(line);

			
 
				+            String word;

			
 
				+            while (st.hasMoreTokens()) {

			
 
				+              word = st.nextToken();

			
 
				+              this.wordsRead++;

			
 
				+              if (this.map.get(word.length()) == null) {

			
 
				+                this.map.put(word.length(), 1);

			
 
				+              } else {

			
 
				+                int count = this.map.get(word.length());

			
 
				+                this.map.put(word.length(), count + 1);

			
 
				+              }

			
 
				+            }

			
 
				+          }

			
 
				+        } catch (IOException e) {

			
 
				+          System.out.println("Output could not be read!");

			
 
				+          throw e;

			
 
				+        } finally {

			
 
				+          br.close();

			
 
				+        }

			
 
				+      }

			
 
				+

			
 
				+      int medianIndex1 = (int) Math.ceil((this.wordsRead / 2.0));

			
 
				+      int medianIndex2 = (int) Math.floor((this.wordsRead / 2.0));

			
 
				+

			
 
				+      for (Integer key : this.map.navigableKeySet()) {

			
 
				+        int prevNum = num;

			
 
				+        num += this.map.get(key);

			
 
				+

			
 
				+        if (medianIndex2 >= prevNum && medianIndex1 <= num) {

			
 
				+          return key;

			
 
				+        } else if (medianIndex2 >= prevNum && medianIndex1 < num) {

			
 
				+          Integer nextCurrLen = this.map.navigableKeySet().iterator().next();

			
 
				+          double median = (key + nextCurrLen) / 2.0;

			
 
				+          return median;

			
 
				+        }

			
 
				+      }

			
 
				+      return -1;

			
 
				+    }

			
 
				+

			
 
				+  }

			
 
				+

			
 
				+  /**

			
 
				+   * Modified internal test class that is designed to read all the files in the

			
 
				+   * input directory, and find the mean length of all the words.

			
 
				+   */

			
 
				+  public static class WordMeanReader {

			
 
				+    private long wordsRead = 0;

			
 
				+    private long wordLengthsRead = 0;

			
 
				+

			
 
				+    public WordMeanReader() {

			
 
				+    }

			
 
				+

			
 
				+    public double read(String path) throws IOException {

			
 
				+      FileSystem fs = FileSystem.get(new Configuration());

			
 
				+      FileStatus[] files = fs.listStatus(new Path(path));

			
 
				+

			
 
				+      for (FileStatus fileStat : files) {

			
 
				+        if (!fileStat.isFile())

			
 
				+          continue;

			
 
				+

			
 
				+        BufferedReader br = null;

			
 
				+

			
 
				+        try {

			
 
				+          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));

			
 
				+

			
 
				+          String line;

			
 
				+          while ((line = br.readLine()) != null) {

			
 
				+            StringTokenizer st = new StringTokenizer(line);

			
 
				+            String word;

			
 
				+            while (st.hasMoreTokens()) {

			
 
				+              word = st.nextToken();

			
 
				+              this.wordsRead++;

			
 
				+              this.wordLengthsRead += word.length();

			
 
				+            }

			
 
				+          }

			
 
				+        } catch (IOException e) {

			
 
				+          System.out.println("Output could not be read!");

			
 
				+          throw e;

			
 
				+        } finally {

			
 
				+          br.close();

			
 
				+        }

			
 
				+      }

			
 
				+

			
 
				+      double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead));

			
 
				+      return mean;

			
 
				+    }

			
 
				+

			
 
				+  }

			
 
				+

			
 
				+  /**

			
 
				+   * Internal class designed to delete the output directory. Meant solely for

			
 
				+   * use before and after the test is run; this is so next iterations of the

			
 
				+   * test do not encounter a "file already exists" error.

			
 
				+   * 

			
 
				+   * @param dir

			
 
				+   *          The directory to delete.

			
 
				+   * @return Returns whether the deletion was successful or not.

			
 
				+   */

			
 
				+  public static boolean deleteDir(File dir) {

			
 
				+    if (dir.isDirectory()) {

			
 
				+      String[] children = dir.list();

			
 
				+      for (int i = 0; i < children.length; i++) {

			
 
				+        boolean success = deleteDir(new File(dir, children[i]));

			
 
				+        if (!success) {

			
 
				+          System.out.println("Could not delete directory after test!");

			
 
				+          return false;

			
 
				+        }

			
 
				+      }

			
 
				+    }

			
 
				+

			
 
				+    // The directory is now empty so delete it

			
 
				+    return dir.delete();

			
 
				+  }

			
 
				+

			
 
				+  @Before public void setup() throws Exception {

			
 
				+    deleteDir(new File(MEAN_OUTPUT));

			
 
				+    deleteDir(new File(MEDIAN_OUTPUT));

			
 
				+    deleteDir(new File(STDDEV_OUTPUT));

			
 
				+  }

			
 
				+

			
 
				+  @Test public void testGetTheMean() throws Exception {

			
 
				+    String args[] = new String[2];

			
 
				+    args[0] = INPUT;

			
 
				+    args[1] = MEAN_OUTPUT;

			
 
				+

			
 
				+    WordMean wm = new WordMean();

			
 
				+    ToolRunner.run(new Configuration(), wm, args);

			
 
				+    double mean = wm.getMean();

			
 
				+

			
 
				+    // outputs MUST match

			
 
				+    WordMeanReader wr = new WordMeanReader();

			
 
				+    assertEquals(mean, wr.read(INPUT), 0.0);

			
 
				+  }

			
 
				+

			
 
				+  @Test public void testGetTheMedian() throws Exception {

			
 
				+    String args[] = new String[2];

			
 
				+    args[0] = INPUT;

			
 
				+    args[1] = MEDIAN_OUTPUT;

			
 
				+

			
 
				+    WordMedian wm = new WordMedian();

			
 
				+    ToolRunner.run(new Configuration(), wm, args);

			
 
				+    double median = wm.getMedian();

			
 
				+

			
 
				+    // outputs MUST match

			
 
				+    WordMedianReader wr = new WordMedianReader();

			
 
				+    assertEquals(median, wr.read(INPUT), 0.0);

			
 
				+  }

			
 
				+

			
 
				+  @Test public void testGetTheStandardDeviation() throws Exception {

			
 
				+    String args[] = new String[2];

			
 
				+    args[0] = INPUT;

			
 
				+    args[1] = STDDEV_OUTPUT;

			
 
				+

			
 
				+    WordStandardDeviation wsd = new WordStandardDeviation();

			
 
				+    ToolRunner.run(new Configuration(), wsd, args);

			
 
				+    double stddev = wsd.getStandardDeviation();

			
 
				+

			
 
				+    // outputs MUST match

			
 
				+    WordStdDevReader wr = new WordStdDevReader();

			
 
				+    assertEquals(stddev, wr.read(INPUT), 0.0);

			
 
				+  }

			
 
				+

			
 
				+}