|
@@ -1,9 +1,9 @@
|
|
|
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
|
|
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
|
|
<html>
|
|
|
<head>
|
|
|
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
|
|
<meta content="Apache Forrest" name="Generator">
|
|
|
-<meta name="Forrest-version" content="0.7">
|
|
|
+<meta name="Forrest-version" content="0.8">
|
|
|
<meta name="Forrest-skin-name" content="pelt">
|
|
|
<title>Hadoop Map-Reduce Tutorial</title>
|
|
|
<link type="text/css" href="skin/basic.css" rel="stylesheet">
|
|
@@ -16,46 +16,91 @@
|
|
|
<body onload="init()">
|
|
|
<script type="text/javascript">ndeSetTextSize();</script>
|
|
|
<div id="top">
|
|
|
+<!--+
|
|
|
+ |breadtrail
|
|
|
+ +-->
|
|
|
<div class="breadtrail">
|
|
|
<a href="http://www.apache.org/">Apache</a> > <a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/hadoop/">Hadoop</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
|
|
|
</div>
|
|
|
+<!--+
|
|
|
+ |header
|
|
|
+ +-->
|
|
|
<div class="header">
|
|
|
+<!--+
|
|
|
+ |start group logo
|
|
|
+ +-->
|
|
|
<div class="grouplogo">
|
|
|
<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
|
|
|
</div>
|
|
|
+<!--+
|
|
|
+ |end group logo
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Project Logo
|
|
|
+ +-->
|
|
|
<div class="projectlogo">
|
|
|
<a href="http://lucene.apache.org/hadoop/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Scalable Computing Platform"></a>
|
|
|
</div>
|
|
|
+<!--+
|
|
|
+ |end Project Logo
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Search
|
|
|
+ +-->
|
|
|
<div class="searchbox">
|
|
|
<form action="http://www.google.com/search" method="get" class="roundtopsmall">
|
|
|
<input value="lucene.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">
|
|
|
- <input attr="value" name="Search" value="Search" type="submit">
|
|
|
+ <input name="Search" value="Search" type="submit">
|
|
|
</form>
|
|
|
</div>
|
|
|
+<!--+
|
|
|
+ |end search
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Tabs
|
|
|
+ +-->
|
|
|
<ul id="tabs">
|
|
|
<li>
|
|
|
-<a class="base-not-selected" href="http://lucene.apache.org/hadoop/">Project</a>
|
|
|
+<a class="unselected" href="http://lucene.apache.org/hadoop/">Project</a>
|
|
|
</li>
|
|
|
<li>
|
|
|
-<a class="base-not-selected" href="http://wiki.apache.org/lucene-hadoop">Wiki</a>
|
|
|
+<a class="unselected" href="http://wiki.apache.org/lucene-hadoop">Wiki</a>
|
|
|
</li>
|
|
|
<li class="current">
|
|
|
-<a class="base-selected" href="index.html">Hadoop 0.16 Documentation</a>
|
|
|
+<a class="selected" href="index.html">Hadoop 0.16 Documentation</a>
|
|
|
</li>
|
|
|
</ul>
|
|
|
+<!--+
|
|
|
+ |end Tabs
|
|
|
+ +-->
|
|
|
</div>
|
|
|
</div>
|
|
|
<div id="main">
|
|
|
<div id="publishedStrip">
|
|
|
+<!--+
|
|
|
+ |start Subtabs
|
|
|
+ +-->
|
|
|
<div id="level2tabs"></div>
|
|
|
+<!--+
|
|
|
+ |end Endtabs
|
|
|
+ +-->
|
|
|
<script type="text/javascript"><!--
|
|
|
-document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
+document.write("Last Published: " + document.lastModified);
|
|
|
// --></script>
|
|
|
</div>
|
|
|
+<!--+
|
|
|
+ |breadtrail
|
|
|
+ +-->
|
|
|
<div class="breadtrail">
|
|
|
-
|
|
|
+
|
|
|
|
|
|
</div>
|
|
|
+<!--+
|
|
|
+ |start Menu, mainarea
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Menu
|
|
|
+ +-->
|
|
|
<div id="menu">
|
|
|
<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Documentation</div>
|
|
|
<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
|
|
@@ -75,6 +120,9 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
<div class="menupagetitle">Map-Reduce Tutorial</div>
|
|
|
</div>
|
|
|
<div class="menuitem">
|
|
|
+<a href="native_libraries.html">Native Hadoop Libraries</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
<a href="streaming.html">Streaming</a>
|
|
|
</div>
|
|
|
<div class="menuitem">
|
|
@@ -96,8 +144,17 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
<div id="credit"></div>
|
|
|
<div id="roundbottom">
|
|
|
<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
|
|
|
+<!--+
|
|
|
+ |alternative credits
|
|
|
+ +-->
|
|
|
<div id="credit2"></div>
|
|
|
</div>
|
|
|
+<!--+
|
|
|
+ |end Menu
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start content
|
|
|
+ +-->
|
|
|
<div id="content">
|
|
|
<div title="Portable Document Format" class="pdflink">
|
|
|
<a class="dida" href="mapred_tutorial.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
|
|
@@ -206,6 +263,9 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
<li>
|
|
|
<a href="#JobControl">JobControl</a>
|
|
|
</li>
|
|
|
+<li>
|
|
|
+<a href="#Data+Compression">Data Compression</a>
|
|
|
+</li>
|
|
|
</ul>
|
|
|
</li>
|
|
|
</ul>
|
|
@@ -214,7 +274,7 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
<a href="#Example%3A+WordCount+v2.0">Example: WordCount v2.0</a>
|
|
|
<ul class="minitoc">
|
|
|
<li>
|
|
|
-<a href="#Source+Code-N10A91">Source Code</a>
|
|
|
+<a href="#Source+Code-N10B1F">Source Code</a>
|
|
|
</li>
|
|
|
<li>
|
|
|
<a href="#Sample+Runs">Sample Runs</a>
|
|
@@ -1513,9 +1573,14 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
individual task.</p>
|
|
|
<p>
|
|
|
<a href="api/org/apache/hadoop/mapred/TextInputFormat.html">
|
|
|
- TextInputFormat</a> is the default <span class="codefrag">InputFormat</span>.
|
|
|
- </p>
|
|
|
-<a name="N108E9"></a><a name="InputSplit"></a>
|
|
|
+ TextInputFormat</a> is the default <span class="codefrag">InputFormat</span>.</p>
|
|
|
+<p>If <span class="codefrag">TextInputFormat</span> is the <span class="codefrag">InputFormat</span> for a
|
|
|
+ given job, the framework detects input-files with the <em>.gz</em> and
|
|
|
+ <em>.lzo</em> extensions and automatically decompresses them using the
|
|
|
+ appropriate <span class="codefrag">CompressionCodec</span>. However, it must be noted that
|
|
|
+ compressed files with the above extensions cannot be <em>split</em> and
|
|
|
+ each compressed file is processed in its entirety by a single mapper.</p>
|
|
|
+<a name="N108FE"></a><a name="InputSplit"></a>
|
|
|
<h4>InputSplit</h4>
|
|
|
<p>
|
|
|
<a href="api/org/apache/hadoop/mapred/InputSplit.html">
|
|
@@ -1529,7 +1594,7 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
FileSplit</a> is the default <span class="codefrag">InputSplit</span>. It sets
|
|
|
<span class="codefrag">map.input.file</span> to the path of the input file for the
|
|
|
logical split.</p>
|
|
|
-<a name="N1090E"></a><a name="RecordReader"></a>
|
|
|
+<a name="N10923"></a><a name="RecordReader"></a>
|
|
|
<h4>RecordReader</h4>
|
|
|
<p>
|
|
|
<a href="api/org/apache/hadoop/mapred/RecordReader.html">
|
|
@@ -1541,7 +1606,7 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
for processing. <span class="codefrag">RecordReader</span> thus assumes the
|
|
|
responsibility of processing record boundaries and presents the tasks
|
|
|
with keys and values.</p>
|
|
|
-<a name="N10931"></a><a name="Job+Output"></a>
|
|
|
+<a name="N10946"></a><a name="Job+Output"></a>
|
|
|
<h3 class="h4">Job Output</h3>
|
|
|
<p>
|
|
|
<a href="api/org/apache/hadoop/mapred/OutputFormat.html">
|
|
@@ -1566,7 +1631,7 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
<p>
|
|
|
<span class="codefrag">TextOutputFormat</span> is the default
|
|
|
<span class="codefrag">OutputFormat</span>.</p>
|
|
|
-<a name="N1095A"></a><a name="Task+Side-Effect+Files"></a>
|
|
|
+<a name="N1096F"></a><a name="Task+Side-Effect+Files"></a>
|
|
|
<h4>Task Side-Effect Files</h4>
|
|
|
<p>In some applications, component tasks need to create and/or write to
|
|
|
side-files, which differ from the actual job-output files.</p>
|
|
@@ -1592,7 +1657,7 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
JobConf.getOutputPath()</a>, and the framework will promote them
|
|
|
similarly for succesful task-attempts, thus eliminating the need to
|
|
|
pick unique paths per task-attempt.</p>
|
|
|
-<a name="N1098F"></a><a name="RecordWriter"></a>
|
|
|
+<a name="N109A4"></a><a name="RecordWriter"></a>
|
|
|
<h4>RecordWriter</h4>
|
|
|
<p>
|
|
|
<a href="api/org/apache/hadoop/mapred/RecordWriter.html">
|
|
@@ -1600,9 +1665,9 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
pairs to an output file.</p>
|
|
|
<p>RecordWriter implementations write the job outputs to the
|
|
|
<span class="codefrag">FileSystem</span>.</p>
|
|
|
-<a name="N109A6"></a><a name="Other+Useful+Features"></a>
|
|
|
+<a name="N109BB"></a><a name="Other+Useful+Features"></a>
|
|
|
<h3 class="h4">Other Useful Features</h3>
|
|
|
-<a name="N109AC"></a><a name="Counters"></a>
|
|
|
+<a name="N109C1"></a><a name="Counters"></a>
|
|
|
<h4>Counters</h4>
|
|
|
<p>
|
|
|
<span class="codefrag">Counters</span> represent global counters, defined either by
|
|
@@ -1616,7 +1681,7 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
Reporter.incrCounter(Enum, long)</a> in the <span class="codefrag">map</span> and/or
|
|
|
<span class="codefrag">reduce</span> methods. These counters are then globally
|
|
|
aggregated by the framework.</p>
|
|
|
-<a name="N109D7"></a><a name="DistributedCache"></a>
|
|
|
+<a name="N109EC"></a><a name="DistributedCache"></a>
|
|
|
<h4>DistributedCache</h4>
|
|
|
<p>
|
|
|
<a href="api/org/apache/hadoop/filecache/DistributedCache.html">
|
|
@@ -1648,7 +1713,7 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
<span class="codefrag">DistributedCache</span> tracks the modification timestamps of
|
|
|
the cached files. Clearly the cache files should not be modified by
|
|
|
the application or externally while the job is executing.</p>
|
|
|
-<a name="N10A11"></a><a name="Tool"></a>
|
|
|
+<a name="N10A26"></a><a name="Tool"></a>
|
|
|
<h4>Tool</h4>
|
|
|
<p>The <a href="api/org/apache/hadoop/util/Tool.html">Tool</a>
|
|
|
interface supports the handling of generic Hadoop command-line options.
|
|
@@ -1688,7 +1753,7 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
</span>
|
|
|
|
|
|
</p>
|
|
|
-<a name="N10A43"></a><a name="IsolationRunner"></a>
|
|
|
+<a name="N10A58"></a><a name="IsolationRunner"></a>
|
|
|
<h4>IsolationRunner</h4>
|
|
|
<p>
|
|
|
<a href="api/org/apache/hadoop/mapred/IsolationRunner.html">
|
|
@@ -1712,21 +1777,73 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
<p>
|
|
|
<span class="codefrag">IsolationRunner</span> will run the failed task in a single
|
|
|
jvm, which can be in the debugger, over precisely the same input.</p>
|
|
|
-<a name="N10A76"></a><a name="JobControl"></a>
|
|
|
+<a name="N10A8B"></a><a name="JobControl"></a>
|
|
|
<h4>JobControl</h4>
|
|
|
<p>
|
|
|
<a href="api/org/apache/hadoop/mapred/jobcontrol/package-summary.html">
|
|
|
JobControl</a> is a utility which encapsulates a set of Map-Reduce jobs
|
|
|
and their dependencies.</p>
|
|
|
+<a name="N10A98"></a><a name="Data+Compression"></a>
|
|
|
+<h4>Data Compression</h4>
|
|
|
+<p>Hadoop Map-Reduce provides facilities for the application-writer to
|
|
|
+ specify compression for both intermediate map-outputs and the
|
|
|
+ job-outputs i.e. output of the reduces. It also comes bundled with
|
|
|
+ <a href="api/org/apache/hadoop/io/compress/CompressionCodec.html">
|
|
|
+ CompressionCodec</a> implementations for the
|
|
|
+ <a href="http://www.zlib.net/">zlib</a> and <a href="http://www.oberhumer.com/opensource/lzo/">lzo</a> compression
|
|
|
+ algorithms. The <a href="http://www.gzip.org/">gzip</a> file format is also
|
|
|
+ supported.</p>
|
|
|
+<p>Hadoop also provides native implementations of the above compression
|
|
|
+ codecs for reasons of both performance (zlib) and non-availability of
|
|
|
+ Java libraries (lzo). More details on their usage and availability are
|
|
|
+ available <a href="native_libraries.html">here</a>.</p>
|
|
|
+<a name="N10AB8"></a><a name="Intermediate+Outputs"></a>
|
|
|
+<h5>Intermediate Outputs</h5>
|
|
|
+<p>Applications can control compression of intermediate map-outputs
|
|
|
+ via the
|
|
|
+ <a href="api/org/apache/hadoop/mapred/JobConf.html#setCompressMapOutput(boolean)">
|
|
|
+ JobConf.setCompressMapOutput(boolean)</a> api and the
|
|
|
+ <span class="codefrag">CompressionCodec</span> to be used via the
|
|
|
+ <a href="api/org/apache/hadoop/mapred/JobConf.html#setMapOutputCompressorClass(java.lang.Class)">
|
|
|
+ JobConf.setMapOutputCompressorClass(Class)</a> api. Since
|
|
|
+ the intermediate map-outputs are always stored in the
|
|
|
+ <a href="api/org/apache/hadoop/io/SequenceFile.html">SequenceFile</a>
|
|
|
+ format, the
|
|
|
+ <a href="api/org/apache/hadoop/io/SequenceFile.CompressionType.html">
|
|
|
+ SequenceFile.CompressionType</a> (i.e.
|
|
|
+ <a href="api/org/apache/hadoop/io/SequenceFile.CompressionType.html#RECORD">
|
|
|
+ RECORD</a> /
|
|
|
+ <a href="api/org/apache/hadoop/io/SequenceFile.CompressionType.html#BLOCK">
|
|
|
+ BLOCK</a> - defaults to <span class="codefrag">RECORD</span>) can be specified via the
|
|
|
+ <a href="api/org/apache/hadoop/mapred/JobConf.html#setMapOutputCompressionType(org.apache.hadoop.io.SequenceFile.CompressionType)">
|
|
|
+ JobConf.setMapOutputCompressionType(SequenceFile.CompressionType)</a>
|
|
|
+ api.</p>
|
|
|
+<a name="N10AE4"></a><a name="Job+Outputs"></a>
|
|
|
+<h5>Job Outputs</h5>
|
|
|
+<p>Applications can control compression of job-outputs via the
|
|
|
+ <a href="api/org/apache/hadoop/mapred/OutputFormatBase.html#setCompressOutput(org.apache.hadoop.mapred.JobConf,%20boolean)">
|
|
|
+ OutputFormatBase.setCompressOutput(JobConf, boolean)</a> api and the
|
|
|
+ <span class="codefrag">CompressionCodec</span> to be used can be specified via the
|
|
|
+ <a href="api/org/apache/hadoop/mapred/OutputFormatBase.html#setOutputCompressorClass(org.apache.hadoop.mapred.JobConf,%20java.lang.Class)">
|
|
|
+ OutputFormatBase.setOutputCompressorClass(JobConf, Class)</a> api.</p>
|
|
|
+<p>If the job outputs are to be stored in the
|
|
|
+ <a href="api/org/apache/hadoop/mapred/SequenceFileOutputFormat.html">
|
|
|
+ SequenceFileOutputFormat</a>, the required
|
|
|
+ <span class="codefrag">SequenceFile.CompressionType</span> (i.e. <span class="codefrag">RECORD</span> /
|
|
|
+ <span class="codefrag">BLOCK</span> - defaults to <span class="codefrag">RECORD</span>)can be specified
|
|
|
+ via the
|
|
|
+ <a href="api/org/apache/hadoop/mapred/SequenceFileOutputFormat.html#setOutputCompressionType(org.apache.hadoop.mapred.JobConf,%20org.apache.hadoop.io.SequenceFile.CompressionType)">
|
|
|
+ SequenceFileOutputFormat.setOutputCompressionType(JobConf,
|
|
|
+ SequenceFile.CompressionType)</a> api.</p>
|
|
|
</div>
|
|
|
|
|
|
|
|
|
-<a name="N10A85"></a><a name="Example%3A+WordCount+v2.0"></a>
|
|
|
+<a name="N10B13"></a><a name="Example%3A+WordCount+v2.0"></a>
|
|
|
<h2 class="h3">Example: WordCount v2.0</h2>
|
|
|
<div class="section">
|
|
|
<p>Here is a more complete <span class="codefrag">WordCount</span> which uses many of the
|
|
|
features provided by the Map-Reduce framework we discussed so far:</p>
|
|
|
-<a name="N10A91"></a><a name="Source+Code-N10A91"></a>
|
|
|
+<a name="N10B1F"></a><a name="Source+Code-N10B1F"></a>
|
|
|
<h3 class="h4">Source Code</h3>
|
|
|
<table class="ForrestTable" cellspacing="1" cellpadding="4">
|
|
|
|
|
@@ -2904,7 +3021,7 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
</tr>
|
|
|
|
|
|
</table>
|
|
|
-<a name="N111C3"></a><a name="Sample+Runs"></a>
|
|
|
+<a name="N11251"></a><a name="Sample+Runs"></a>
|
|
|
<h3 class="h4">Sample Runs</h3>
|
|
|
<p>Sample text-files as input:</p>
|
|
|
<p>
|
|
@@ -3069,7 +3186,7 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
<br>
|
|
|
|
|
|
</p>
|
|
|
-<a name="N11293"></a><a name="Salient+Points"></a>
|
|
|
+<a name="N11321"></a><a name="Salient+Points"></a>
|
|
|
<h3 class="h4">Salient Points</h3>
|
|
|
<p>The second version of <span class="codefrag">WordCount</span> improves upon the
|
|
|
previous one by using some features offered by the Map-Reduce framework:
|
|
@@ -3114,18 +3231,27 @@ document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
|
|
|
|
|
|
</div>
|
|
|
+<!--+
|
|
|
+ |end content
|
|
|
+ +-->
|
|
|
<div class="clearboth"> </div>
|
|
|
</div>
|
|
|
<div id="footer">
|
|
|
+<!--+
|
|
|
+ |start bottomstrip
|
|
|
+ +-->
|
|
|
<div class="lastmodified">
|
|
|
<script type="text/javascript"><!--
|
|
|
-document.write("<text>Last Published:</text> " + document.lastModified);
|
|
|
+document.write("Last Published: " + document.lastModified);
|
|
|
// --></script>
|
|
|
</div>
|
|
|
<div class="copyright">
|
|
|
Copyright ©
|
|
|
2007 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
|
|
|
</div>
|
|
|
+<!--+
|
|
|
+ |end bottomstrip
|
|
|
+ +-->
|
|
|
</div>
|
|
|
</body>
|
|
|
</html>
|