|
@@ -135,6 +135,12 @@
|
|
<p><code>WordCount</code> is a simple application that counts the number of
|
|
<p><code>WordCount</code> is a simple application that counts the number of
|
|
      occurrences of each word in a given input set.</p>
|
|
      occurrences of each word in a given input set.</p>
|
|
|
|
|
|
|
|
+ <p>This works with a
|
|
|
|
+ <a href="quickstart.html#Standalone+Operation">local-standalone</a>,
|
|
|
|
+ <a href="quickstart.html#SingleNodeSetup">pseudo-distributed</a> or
|
|
|
|
+ <a href="quickstart.html#Fully-Distributed+Operation">fully-distributed</a>
|
|
|
|
+ Hadoop installation.</p>
|
|
|
|
+
|
|
<section>
|
|
<section>
|
|
<title>Source Code</title>
|
|
<title>Source Code</title>
|
|
|
|
|
|
@@ -156,7 +162,7 @@
|
|
<tr>
|
|
<tr>
|
|
<td>3.</td>
|
|
<td>3.</td>
|
|
<td>
|
|
<td>
|
|
- <code>import java.io.Exception;</code>
|
|
|
|
|
|
+ <code>import java.io.IOException;</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
@@ -218,7 +224,7 @@
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
- public static class MapClass extends MapReduceBase
|
|
|
|
|
|
+ public static class Map extends MapReduceBase
|
|
implements Mapper<LongWritable, Text, Text, IntWritable> {
|
|
implements Mapper<LongWritable, Text, Text, IntWritable> {
|
|
</code>
|
|
</code>
|
|
</td>
|
|
</td>
|
|
@@ -439,7 +445,7 @@
|
|
<td>45.</td>
|
|
<td>45.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
- <code>conf.setMapperClass(MapClass.class);</code>
|
|
|
|
|
|
+ <code>conf.setMapperClass(Map.class);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
@@ -482,14 +488,14 @@
|
|
<td>52.</td>
|
|
<td>52.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
- <code>conf.setInputPath(new Path(args[1]));</code>
|
|
|
|
|
|
+ <code>conf.setInputPath(new Path(args[0]));</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
<td>53.</td>
|
|
<td>53.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
- <code>conf.setOutputPath(new Path(args[2]));</code>
|
|
|
|
|
|
+ <code>conf.setOutputPath(new Path(args[1]));</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
@@ -530,11 +536,12 @@
|
|
<code>HADOOP_VERSION</code> is the Hadoop version installed, compile
|
|
<code>HADOOP_VERSION</code> is the Hadoop version installed, compile
|
|
<code>WordCount.java</code> and create a jar:</p>
|
|
<code>WordCount.java</code> and create a jar:</p>
|
|
<p>
|
|
<p>
|
|
|
|
+ <code>$ mkdir wordcount_classes</code><br/>
|
|
<code>
|
|
<code>
|
|
$ javac -classpath ${HADOOP_HOME}/hadoop-${HADOOP_VERSION}-core.jar
|
|
$ javac -classpath ${HADOOP_HOME}/hadoop-${HADOOP_VERSION}-core.jar
|
|
- WordCount.java
|
|
|
|
|
|
+ -d wordcount_classes WordCount.java
|
|
</code><br/>
|
|
</code><br/>
|
|
- <code>$ jar -cvf /usr/joe/wordcount.jar WordCount.class</code>
|
|
|
|
|
|
+ <code>$ jar -cvf /usr/joe/wordcount.jar -C wordcount_classes/ .</code>
|
|
</p>
|
|
</p>
|
|
|
|
|
|
<p>Assuming that:</p>
|
|
<p>Assuming that:</p>
|
|
@@ -1343,7 +1350,13 @@
|
|
<title>Example: WordCount v2.0</title>
|
|
<title>Example: WordCount v2.0</title>
|
|
|
|
|
|
<p>Here is a more complete <code>WordCount</code> which uses many of the
|
|
<p>Here is a more complete <code>WordCount</code> which uses many of the
|
|
- features provided by the Map-Reduce framework we discussed so far:</p>
|
|
|
|
|
|
+ features provided by the Map-Reduce framework we discussed so far.</p>
|
|
|
|
+
|
|
|
|
+ <p>This needs the HDFS to be up and running, especially for the
|
|
|
|
+ <code>DistributedCache</code>-related features. Hence it only works with a
|
|
|
|
+ <a href="quickstart.html#SingleNodeSetup">pseudo-distributed</a> or
|
|
|
|
+ <a href="quickstart.html#Fully-Distributed+Operation">fully-distributed</a>
|
|
|
|
+ Hadoop installation.</p>
|
|
|
|
|
|
<section>
|
|
<section>
|
|
<title>Source Code</title>
|
|
<title>Source Code</title>
|
|
@@ -1434,7 +1447,7 @@
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
- public static class MapClass extends MapReduceBase
|
|
|
|
|
|
+ public static class Map extends MapReduceBase
|
|
implements Mapper<LongWritable, Text, Text, IntWritable> {
|
|
implements Mapper<LongWritable, Text, Text, IntWritable> {
|
|
</code>
|
|
</code>
|
|
</td>
|
|
</td>
|
|
@@ -1543,36 +1556,43 @@
|
|
<td>32.</td>
|
|
<td>32.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
- <code>Path[] patternsFiles = new Path[0];</code>
|
|
|
|
|
|
+ <code>if (job.getBoolean("wordcount.skip.patterns", false)) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
<td>33.</td>
|
|
<td>33.</td>
|
|
<td>
|
|
<td>
|
|
-
|
|
|
|
- <code>try {</code>
|
|
|
|
|
|
+
|
|
|
|
+ <code>Path[] patternsFiles = new Path[0];</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
<td>34.</td>
|
|
<td>34.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
|
|
+ <code>try {</code>
|
|
|
|
+ </td>
|
|
|
|
+ </tr>
|
|
|
|
+ <tr>
|
|
|
|
+ <td>35.</td>
|
|
|
|
+ <td>
|
|
|
|
+
|
|
<code>
|
|
<code>
|
|
patternsFiles = DistributedCache.getLocalCacheFiles(job);
|
|
patternsFiles = DistributedCache.getLocalCacheFiles(job);
|
|
</code>
|
|
</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>35.</td>
|
|
|
|
|
|
+ <td>36.</td>
|
|
<td>
|
|
<td>
|
|
-
|
|
|
|
|
|
+
|
|
<code>} catch (IOException ioe) {</code>
|
|
<code>} catch (IOException ioe) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>36.</td>
|
|
|
|
|
|
+ <td>37.</td>
|
|
<td>
|
|
<td>
|
|
-
|
|
|
|
|
|
+
|
|
<code>
|
|
<code>
|
|
System.err.println("Caught exception while getting cached files: "
|
|
System.err.println("Caught exception while getting cached files: "
|
|
+ StringUtils.stringifyException(ioe));
|
|
+ StringUtils.stringifyException(ioe));
|
|
@@ -1580,60 +1600,67 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>37.</td>
|
|
|
|
|
|
+ <td>38.</td>
|
|
<td>
|
|
<td>
|
|
-
|
|
|
|
|
|
+
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>38.</td>
|
|
|
|
|
|
+ <td>39.</td>
|
|
<td>
|
|
<td>
|
|
-
|
|
|
|
|
|
+
|
|
<code>for (Path patternsFile : patternsFiles) {</code>
|
|
<code>for (Path patternsFile : patternsFiles) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>39.</td>
|
|
|
|
|
|
+ <td>40.</td>
|
|
<td>
|
|
<td>
|
|
-
|
|
|
|
|
|
+
|
|
<code>parseSkipFile(patternsFile);</code>
|
|
<code>parseSkipFile(patternsFile);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>40.</td>
|
|
|
|
|
|
+ <td>41.</td>
|
|
|
|
+ <td>
|
|
|
|
+
|
|
|
|
+ <code>}</code>
|
|
|
|
+ </td>
|
|
|
|
+ </tr>
|
|
|
|
+ <tr>
|
|
|
|
+ <td>42.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>41.</td>
|
|
|
|
|
|
+ <td>43.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>42.</td>
|
|
|
|
|
|
+ <td>44.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>43.</td>
|
|
|
|
|
|
+ <td>45.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>private void parseSkipFile(Path patternsFile) {</code>
|
|
<code>private void parseSkipFile(Path patternsFile) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>44.</td>
|
|
|
|
|
|
+ <td>46.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>try {</code>
|
|
<code>try {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>45.</td>
|
|
|
|
|
|
+ <td>47.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -1643,42 +1670,42 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>46.</td>
|
|
|
|
|
|
+ <td>48.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>String pattern = null;</code>
|
|
<code>String pattern = null;</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>47.</td>
|
|
|
|
|
|
+ <td>49.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>while ((pattern = fis.readLine()) != null) {</code>
|
|
<code>while ((pattern = fis.readLine()) != null) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>48.</td>
|
|
|
|
|
|
+ <td>50.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>patternsToSkip.add(pattern);</code>
|
|
<code>patternsToSkip.add(pattern);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>49.</td>
|
|
|
|
|
|
+ <td>51.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>50.</td>
|
|
|
|
|
|
+ <td>52.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>} catch (IOException ioe) {</code>
|
|
<code>} catch (IOException ioe) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>51.</td>
|
|
|
|
|
|
+ <td>53.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -1690,25 +1717,25 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>52.</td>
|
|
|
|
|
|
+ <td>54.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>53.</td>
|
|
|
|
|
|
+ <td>55.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>54.</td>
|
|
|
|
|
|
+ <td>56.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>55.</td>
|
|
|
|
|
|
+ <td>57.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -1719,7 +1746,7 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>56.</td>
|
|
|
|
|
|
+ <td>58.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -1730,89 +1757,89 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>57.</td>
|
|
|
|
|
|
+ <td>59.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>58.</td>
|
|
|
|
|
|
+ <td>60.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>for (String pattern : patternsToSkip) {</code>
|
|
<code>for (String pattern : patternsToSkip) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>59.</td>
|
|
|
|
|
|
+ <td>61.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>line = line.replaceAll(pattern, "");</code>
|
|
<code>line = line.replaceAll(pattern, "");</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>60.</td>
|
|
|
|
|
|
+ <td>62.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>61.</td>
|
|
|
|
|
|
+ <td>63.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>62.</td>
|
|
|
|
|
|
+ <td>64.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>StringTokenizer tokenizer = new StringTokenizer(line);</code>
|
|
<code>StringTokenizer tokenizer = new StringTokenizer(line);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>63.</td>
|
|
|
|
|
|
+ <td>65.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>while (tokenizer.hasMoreTokens()) {</code>
|
|
<code>while (tokenizer.hasMoreTokens()) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>64.</td>
|
|
|
|
|
|
+ <td>66.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>word.set(tokenizer.nextToken());</code>
|
|
<code>word.set(tokenizer.nextToken());</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>65.</td>
|
|
|
|
|
|
+ <td>67.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>output.collect(word, one);</code>
|
|
<code>output.collect(word, one);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>66.</td>
|
|
|
|
|
|
+ <td>68.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>reporter.incrCounter(Counters.INPUT_WORDS, 1);</code>
|
|
<code>reporter.incrCounter(Counters.INPUT_WORDS, 1);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>67.</td>
|
|
|
|
|
|
+ <td>69.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>68.</td>
|
|
|
|
|
|
+ <td>70.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>69.</td>
|
|
|
|
|
|
+ <td>71.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>if ((++numRecords % 100) == 0) {</code>
|
|
<code>if ((++numRecords % 100) == 0) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>70.</td>
|
|
|
|
|
|
+ <td>72.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -1823,32 +1850,32 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>71.</td>
|
|
|
|
|
|
+ <td>73.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>72.</td>
|
|
|
|
|
|
+ <td>74.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>73.</td>
|
|
|
|
|
|
+ <td>75.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>74.</td>
|
|
|
|
|
|
+ <td>76.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>75.</td>
|
|
|
|
|
|
+ <td>77.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -1858,7 +1885,7 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>76.</td>
|
|
|
|
|
|
+ <td>78.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -1869,67 +1896,67 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>77.</td>
|
|
|
|
|
|
+ <td>79.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>int sum = 0;</code>
|
|
<code>int sum = 0;</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>78.</td>
|
|
|
|
|
|
+ <td>80.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>while (values.hasNext()) {</code>
|
|
<code>while (values.hasNext()) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>79.</td>
|
|
|
|
|
|
+ <td>81.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>sum += values.next().get();</code>
|
|
<code>sum += values.next().get();</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>80.</td>
|
|
|
|
|
|
+ <td>82.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>81.</td>
|
|
|
|
|
|
+ <td>83.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>output.collect(key, new IntWritable(sum));</code>
|
|
<code>output.collect(key, new IntWritable(sum));</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>82.</td>
|
|
|
|
|
|
+ <td>84.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>83.</td>
|
|
|
|
|
|
+ <td>85.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>84.</td>
|
|
|
|
|
|
+ <td>86.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>85.</td>
|
|
|
|
|
|
+ <td>87.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>public int run(String[] args) throws Exception {</code>
|
|
<code>public int run(String[] args) throws Exception {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>86.</td>
|
|
|
|
|
|
+ <td>88.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -1938,79 +1965,79 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>87.</td>
|
|
|
|
|
|
+ <td>89.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>conf.setJobName("wordcount");</code>
|
|
<code>conf.setJobName("wordcount");</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>88.</td>
|
|
|
|
|
|
+ <td>90.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>89.</td>
|
|
|
|
|
|
+ <td>91.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>conf.setOutputKeyClass(Text.class);</code>
|
|
<code>conf.setOutputKeyClass(Text.class);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>90.</td>
|
|
|
|
|
|
+ <td>92.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>conf.setOutputValueClass(IntWritable.class);</code>
|
|
<code>conf.setOutputValueClass(IntWritable.class);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>91.</td>
|
|
|
|
|
|
+ <td>93.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>92.</td>
|
|
|
|
|
|
+ <td>94.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
- <code>conf.setMapperClass(MapClass.class);</code>
|
|
|
|
|
|
+ <code>conf.setMapperClass(Map.class);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>93.</td>
|
|
|
|
|
|
+ <td>95.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>conf.setCombinerClass(Reduce.class);</code>
|
|
<code>conf.setCombinerClass(Reduce.class);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>94.</td>
|
|
|
|
|
|
+ <td>96.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>conf.setReducerClass(Reduce.class);</code>
|
|
<code>conf.setReducerClass(Reduce.class);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>95.</td>
|
|
|
|
|
|
+ <td>97.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>96.</td>
|
|
|
|
|
|
+ <td>98.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>conf.setInputFormat(TextInputFormat.class);</code>
|
|
<code>conf.setInputFormat(TextInputFormat.class);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>97.</td>
|
|
|
|
|
|
+ <td>99.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>conf.setOutputFormat(TextOutputFormat.class);</code>
|
|
<code>conf.setOutputFormat(TextOutputFormat.class);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>98.</td>
|
|
|
|
|
|
+ <td>100.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>99.</td>
|
|
|
|
|
|
+ <td>101.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -2019,21 +2046,21 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>100.</td>
|
|
|
|
|
|
+ <td>102.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>for (int i=0; i < args.length; ++i) {</code>
|
|
<code>for (int i=0; i < args.length; ++i) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>101.</td>
|
|
|
|
|
|
+ <td>103.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
- <code>if ("-skip".equals(args[i]) {</code>
|
|
|
|
|
|
+ <code>if ("-skip".equals(args[i])) {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>102.</td>
|
|
|
|
|
|
+ <td>104.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -2042,82 +2069,91 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>103.</td>
|
|
|
|
|
|
+ <td>105.</td>
|
|
|
|
+ <td>
|
|
|
|
+
|
|
|
|
+ <code>
|
|
|
|
+ conf.setBoolean("wordcount.skip.patterns", true);
|
|
|
|
+ </code>
|
|
|
|
+ </td>
|
|
|
|
+ </tr>
|
|
|
|
+ <tr>
|
|
|
|
+ <td>106.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>} else {</code>
|
|
<code>} else {</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>104.</td>
|
|
|
|
|
|
+ <td>107.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>other_args.add(args[i]);</code>
|
|
<code>other_args.add(args[i]);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>105.</td>
|
|
|
|
|
|
+ <td>108.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>106.</td>
|
|
|
|
|
|
+ <td>109.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>107.</td>
|
|
|
|
|
|
+ <td>110.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>108.</td>
|
|
|
|
|
|
+ <td>111.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
- <code>conf.setInputPath(new Path(other_args[0]));</code>
|
|
|
|
|
|
+ <code>conf.setInputPath(new Path(other_args.get(0)));</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>109.</td>
|
|
|
|
|
|
+ <td>112.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
- <code>conf.setOutputPath(new Path(other_args[1]));</code>
|
|
|
|
|
|
+ <code>conf.setOutputPath(new Path(other_args.get(1)));</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>110.</td>
|
|
|
|
|
|
+ <td>113.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>111.</td>
|
|
|
|
|
|
+ <td>114.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>JobClient.runJob(conf);</code>
|
|
<code>JobClient.runJob(conf);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>112.</td>
|
|
|
|
|
|
+ <td>115.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>return 0;</code>
|
|
<code>return 0;</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>113.</td>
|
|
|
|
|
|
+ <td>116.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>114.</td>
|
|
|
|
|
|
+ <td>117.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>115.</td>
|
|
|
|
|
|
+ <td>118.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -2126,7 +2162,7 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>116.</td>
|
|
|
|
|
|
+ <td>119.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>
|
|
<code>
|
|
@@ -2136,27 +2172,27 @@
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>117.</td>
|
|
|
|
|
|
+ <td>120.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>System.exit(res);</code>
|
|
<code>System.exit(res);</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>118.</td>
|
|
|
|
|
|
+ <td>121.</td>
|
|
<td>
|
|
<td>
|
|
|
|
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>119.</td>
|
|
|
|
|
|
+ <td>122.</td>
|
|
<td>
|
|
<td>
|
|
<code>}</code>
|
|
<code>}</code>
|
|
</td>
|
|
</td>
|
|
</tr>
|
|
</tr>
|
|
<tr>
|
|
<tr>
|
|
- <td>120.</td>
|
|
|
|
|
|
+ <td>123.</td>
|
|
<td></td>
|
|
<td></td>
|
|
</tr>
|
|
</tr>
|
|
</table>
|
|
</table>
|
|
@@ -2175,7 +2211,7 @@
|
|
<code>Hello World, Bye World!</code><br/>
|
|
<code>Hello World, Bye World!</code><br/>
|
|
<br/>
|
|
<br/>
|
|
<code>$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file02</code><br/>
|
|
<code>$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file02</code><br/>
|
|
- <code>Hello Hadoop, Goodbye the Hadoop.</code>
|
|
|
|
|
|
+ <code>Hello Hadoop, Goodbye to hadoop.</code>
|
|
</p>
|
|
</p>
|
|
|
|
|
|
<p>Run the application:</p>
|
|
<p>Run the application:</p>
|
|
@@ -2195,11 +2231,11 @@
|
|
<code>Bye 1</code><br/>
|
|
<code>Bye 1</code><br/>
|
|
<code>Goodbye 1</code><br/>
|
|
<code>Goodbye 1</code><br/>
|
|
<code>Hadoop, 1</code><br/>
|
|
<code>Hadoop, 1</code><br/>
|
|
- <code>Hadoop. 1</code><br/>
|
|
|
|
<code>Hello 2</code><br/>
|
|
<code>Hello 2</code><br/>
|
|
<code>World! 1</code><br/>
|
|
<code>World! 1</code><br/>
|
|
<code>World, 1</code><br/>
|
|
<code>World, 1</code><br/>
|
|
- <code>the 1</code><br/>
|
|
|
|
|
|
+ <code>hadoop. 1</code><br/>
|
|
|
|
+ <code>to 1</code><br/>
|
|
</p>
|
|
</p>
|
|
|
|
|
|
<p>Notice that the inputs differ from the first version we looked at,
|
|
<p>Notice that the inputs differ from the first version we looked at,
|
|
@@ -2213,7 +2249,7 @@
|
|
<code>\.</code><br/>
|
|
<code>\.</code><br/>
|
|
<code>\,</code><br/>
|
|
<code>\,</code><br/>
|
|
<code>\!</code><br/>
|
|
<code>\!</code><br/>
|
|
- <code>the</code><br/>
|
|
|
|
|
|
+ <code>to</code><br/>
|
|
</p>
|
|
</p>
|
|
|
|
|
|
<p>Run it again, this time with more options:</p>
|
|
<p>Run it again, this time with more options:</p>
|
|
@@ -2233,9 +2269,10 @@
|
|
<br/>
|
|
<br/>
|
|
<code>Bye 1</code><br/>
|
|
<code>Bye 1</code><br/>
|
|
<code>Goodbye 1</code><br/>
|
|
<code>Goodbye 1</code><br/>
|
|
- <code>Hadoop 2</code><br/>
|
|
|
|
|
|
+ <code>Hadoop 1</code><br/>
|
|
<code>Hello 2</code><br/>
|
|
<code>Hello 2</code><br/>
|
|
<code>World 2</code><br/>
|
|
<code>World 2</code><br/>
|
|
|
|
+ <code>hadoop 1</code><br/>
|
|
</p>
|
|
</p>
|
|
|
|
|
|
<p>Run it once more, this time switch-off case-sensitivity:</p>
|
|
<p>Run it once more, this time switch-off case-sensitivity:</p>
|
|
@@ -2262,7 +2299,7 @@
|
|
</section>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<section>
|
|
- <title>Salient Points</title>
|
|
|
|
|
|
+ <title>Highlights</title>
|
|
|
|
|
|
<p>The second version of <code>WordCount</code> improves upon the
|
|
<p>The second version of <code>WordCount</code> improves upon the
|
|
previous one by using some features offered by the Map-Reduce framework:
|
|
previous one by using some features offered by the Map-Reduce framework:
|
|
@@ -2271,23 +2308,23 @@
|
|
<li>
|
|
<li>
|
|
Demonstrates how applications can access configuration parameters
|
|
Demonstrates how applications can access configuration parameters
|
|
in the <code>configure</code> method of the <code>Mapper</code> (and
|
|
in the <code>configure</code> method of the <code>Mapper</code> (and
|
|
- <code>Reducer</code>) implementations (lines 28-41).
|
|
|
|
|
|
+ <code>Reducer</code>) implementations (lines 28-43).
|
|
</li>
|
|
</li>
|
|
<li>
|
|
<li>
|
|
Demonstrates how the <code>DistributedCache</code> can be used to
|
|
Demonstrates how the <code>DistributedCache</code> can be used to
|
|
distribute read-only data needed by the jobs. Here it allows the user
|
|
distribute read-only data needed by the jobs. Here it allows the user
|
|
- to specify word-patterns to skip while counting (line 102).
|
|
|
|
|
|
+ to specify word-patterns to skip while counting (line 104).
|
|
</li>
|
|
</li>
|
|
<li>
|
|
<li>
|
|
Demonstrates the utility of the <code>Tool</code> interface and the
|
|
Demonstrates the utility of the <code>Tool</code> interface and the
|
|
<code>GenericOptionsParser</code> to handle generic Hadoop
|
|
<code>GenericOptionsParser</code> to handle generic Hadoop
|
|
- command-line options (lines 85-86, 116).
|
|
|
|
|
|
+ command-line options (lines 87-116, 119).
|
|
</li>
|
|
</li>
|
|
<li>
|
|
<li>
|
|
- Demonstrates how applications can use <code>Counters</code> (line 66)
|
|
|
|
|
|
+ Demonstrates how applications can use <code>Counters</code> (line 68)
|
|
and how they can set application-specific status information via
|
|
and how they can set application-specific status information via
|
|
the <code>Reporter</code> instance passed to the <code>map</code> (and
|
|
the <code>Reporter</code> instance passed to the <code>map</code> (and
|
|
- <code>reduce</code>) method (line 70).
|
|
|
|
|
|
+ <code>reduce</code>) method (line 72).
|
|
</li>
|
|
</li>
|
|
</ul>
|
|
</ul>
|
|
|
|
|