
Merge -r 705214:705215 from trunk onto 0.19 branch. Fixes HADOOP-4418.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/branches/branch-0.19@705216 13f79535-47bb-0310-9956-ffa450edef68
Devaraj Das 16 years ago
parent
commit
3a8540be12

+ 3 - 0
CHANGES.txt

@@ -884,6 +884,9 @@ Release 0.19.0 - Unreleased
     HADOOP-4278. Increase debug logging for unit test TestDatanodeDeath.
     (dhruba)
 
+    HADOOP-4418. Updates documentation in forrest for Mapred, streaming and pipes.
+    (Amareshwari Sriramadasu via ddas)
+
 Release 0.18.2 - Unreleased
 
   BUG FIXES

+ 1 - 1
conf/hadoop-default.xml

@@ -763,7 +763,7 @@ creations/deletions), or "all".</description>
 
 <property>
   <name>mapred.jobtracker.job.history.block.size</name>
-  <value>3145728></value>
+  <value>3145728</value>
   <description>The block size of the job history file. Since the job recovery
                uses job history, its important to dump job history to disk as 
                soon as possible. Note that this is an expert level parameter.

+ 24 - 3
docs/cluster_setup.html

@@ -231,6 +231,14 @@ document.write("Last Published: " + document.lastModified);
 </ul>
 </li>
 <li>
+<a href="#Cluster+Restartability">Cluster Restartability</a>
+<ul class="minitoc">
+<li>
+<a href="#Map%2FReduce">Map/Reduce</a>
+</li>
+</ul>
+</li>
+<li>
 <a href="#Hadoop+Rack+Awareness">Hadoop Rack Awareness</a>
 </li>
 <li>
@@ -726,8 +734,21 @@ document.write("Last Published: " + document.lastModified);
       typically <span class="codefrag">${HADOOP_HOME}/conf</span>.</p>
 </div>
     
+<a name="N10398"></a><a name="Cluster+Restartability"></a>
+<h2 class="h3">Cluster Restartability</h2>
+<div class="section">
+<a name="N1039E"></a><a name="Map%2FReduce"></a>
+<h3 class="h4">Map/Reduce</h3>
+<p>The job tracker restart can recover running jobs if 
+        <span class="codefrag">mapred.jobtracker.restart.recover</span> is set true and 
+        <a href="#Logging">JobHistory logging</a> is enabled. Also 
+        <span class="codefrag">mapred.jobtracker.job.history.block.size</span> value should be 
+        set to an optimal value to dump job history to disk as soon as 
+        possible, the typical value is 3145728(3MB).</p>
+</div>
+    
     
-<a name="N10398"></a><a name="Hadoop+Rack+Awareness"></a>
+<a name="N103B3"></a><a name="Hadoop+Rack+Awareness"></a>
 <h2 class="h3">Hadoop Rack Awareness</h2>
 <div class="section">
 <p>The HDFS and the Map/Reduce components are rack-aware.</p>
@@ -750,7 +771,7 @@ document.write("Last Published: " + document.lastModified);
 </div>
     
     
-<a name="N103BE"></a><a name="Hadoop+Startup"></a>
+<a name="N103D9"></a><a name="Hadoop+Startup"></a>
 <h2 class="h3">Hadoop Startup</h2>
 <div class="section">
 <p>To start a Hadoop cluster you will need to start both the HDFS and 
@@ -785,7 +806,7 @@ document.write("Last Published: " + document.lastModified);
 </div>
     
     
-<a name="N10404"></a><a name="Hadoop+Shutdown"></a>
+<a name="N1041F"></a><a name="Hadoop+Shutdown"></a>
 <h2 class="h3">Hadoop Shutdown</h2>
 <div class="section">
 <p>
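
The Cluster Restartability text added above names two properties. A minimal sketch of the corresponding site-level overrides, assuming the usual hadoop-site.xml override mechanism (the file placement and surrounding layout are illustrative; the names and values come from the documentation in this commit):

<!-- Sketch of a hadoop-site.xml excerpt: enable job recovery on JobTracker restart. -->
<property>
  <name>mapred.jobtracker.restart.recover</name>
  <value>true</value>
  <description>Recover running jobs when the job tracker restarts;
  requires JobHistory logging to be enabled.</description>
</property>

<property>
  <name>mapred.jobtracker.job.history.block.size</name>
  <value>3145728</value>
  <description>Dump job history to disk in 3 MB blocks, the typical
  value suggested above.</description>
</property>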

The file diff is too large to display
+ 27 - 5
docs/cluster_setup.pdf


+ 16 - 11
docs/hadoop-default.html

@@ -11,12 +11,11 @@
 <td><a name="hadoop.native.lib">hadoop.native.lib</a></td><td>true</td><td>Should native hadoop libraries, if present, be used.</td>
 </tr>
 <tr>
-<td><a name="hadoop.http.filter.initializers">hadoop.http.filter.initializers</a></td><td></td><td>A comma separated list of class names.  Each class in the list must extend org.apache.hadoop.http.FilterInitializer.
-  
-  The corresponding Filter will be initialized.  Then, the Filter will be applied to all user facing jsp and servlet web pages.  The ordering of the list defines the ordering of the filters.
-
-  The value can be empty.
-  </td>
+<td><a name="hadoop.http.filter.initializers">hadoop.http.filter.initializers</a></td><td></td><td>A comma separated list of class names. Each class in the list 
+  must extend org.apache.hadoop.http.FilterInitializer. The corresponding 
+  Filter will be initialized. Then, the Filter will be applied to all user 
+  facing jsp and servlet web pages.  The ordering of the list defines the 
+  ordering of the filters.</td>
 </tr>
 <tr>
 <td><a name="hadoop.logfile.size">hadoop.logfile.size</a></td><td>10000000</td><td>The max size of each log file</td>
@@ -343,10 +342,12 @@ creations/deletions), or "all".</td>
   </td>
 </tr>
 <tr>
-<td><a name="dfs.namenode.decommission.interval">dfs.namenode.decommission.interval</a></td><td>300</td><td>Namenode periodicity in seconds to check if decommission is complete.</td>
+<td><a name="dfs.namenode.decommission.interval">dfs.namenode.decommission.interval</a></td><td>300</td><td>Namenode periodicity in seconds to check if decommission is 
+  complete.</td>
 </tr>
 <tr>
-<td><a name="dfs.replication.interval">dfs.replication.interval</a></td><td>3</td><td>The periodicity in seconds with which the namenode computes repliaction work for datanodes. </td>
+<td><a name="dfs.replication.interval">dfs.replication.interval</a></td><td>3</td><td>The periodicity in seconds with which the namenode computes 
+  repliaction work for datanodes. </td>
 </tr>
 <tr>
 <td><a name="dfs.access.time.precision">dfs.access.time.precision</a></td><td>3600000</td><td>The access time for HDFS file is precise upto this value. 
@@ -470,7 +471,7 @@ creations/deletions), or "all".</td>
   </td>
 </tr>
 <tr>
-<td><a name="mapred.jobtracker.job.history.block.size">mapred.jobtracker.job.history.block.size</a></td><td>3145728&gt;</td><td>The block size of the job history file. Since the job recovery
+<td><a name="mapred.jobtracker.job.history.block.size">mapred.jobtracker.job.history.block.size</a></td><td>3145728</td><td>The block size of the job history file. Since the job recovery
                uses job history, its important to dump job history to disk as 
                soon as possible. Note that this is an expert level parameter.
                The default value is set to 3 MB.
@@ -523,8 +524,8 @@ creations/deletions), or "all".</td>
   </td>
 </tr>
 <tr>
-<td><a name="mapred.jobtracker.completeuserjobs.maximum">mapred.jobtracker.completeuserjobs.maximum</a></td><td>100</td><td>The maximum number of complete jobs per user to keep around before delegating them to the job history.
-  </td>
+<td><a name="mapred.jobtracker.completeuserjobs.maximum">mapred.jobtracker.completeuserjobs.maximum</a></td><td>100</td><td>The maximum number of complete jobs per user to keep around 
+  before delegating them to the job history.</td>
 </tr>
 <tr>
 <td><a name="mapred.jobtracker.instrumentation">mapred.jobtracker.instrumentation</a></td><td>org.apache.hadoop.mapred.JobTrackerMetricsInst</td><td>Expert: The instrumentation class to associate with each JobTracker.
@@ -608,6 +609,10 @@ creations/deletions), or "all".</td>
   take priority over this setting.</td>
 </tr>
 <tr>
+<td><a name="mapred.jobtracker.maxtasks.per.job">mapred.jobtracker.maxtasks.per.job</a></td><td>-1</td><td>The maximum number of tasks for a single job.
+  A value of -1 indicates that there is no maximum.  </td>
+</tr>
+<tr>
 <td><a name="mapred.submit.replication">mapred.submit.replication</a></td><td>10</td><td>The replication level for submitted job files.  This
   should be around the square root of the number of nodes.
   </td>
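
Among the additions above is the previously undocumented mapred.jobtracker.maxtasks.per.job (default -1, i.e. no limit). As a hedged illustration of overriding it, with 100000 being an arbitrary example cap rather than a value from this commit:

<!-- Illustrative override only; 100000 is an arbitrary example cap. -->
<property>
  <name>mapred.jobtracker.maxtasks.per.job</name>
  <value>100000</value>
  <description>Limit any single job to at most 100000 tasks;
  the shipped default of -1 imposes no maximum.</description>
</property>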

+ 58 - 32
docs/mapred_tutorial.html

@@ -252,11 +252,20 @@ document.write("Last Published: " + document.lastModified);
 <a href="#Task+Execution+%26+Environment">Task Execution &amp; Environment</a>
 <ul class="minitoc">
 <li>
+<a href="#Memory+management"> Memory management</a>
+</li>
+<li>
 <a href="#Map+Parameters">Map Parameters</a>
 </li>
 <li>
 <a href="#Shuffle%2FReduce+Parameters">Shuffle/Reduce Parameters</a>
 </li>
+<li>
+<a href="#Directory+Structure"> Directory Structure </a>
+</li>
+<li>
+<a href="#Task+JVM+Reuse">Task JVM Reuse</a>
+</li>
 </ul>
 </li>
 <li>
@@ -330,7 +339,7 @@ document.write("Last Published: " + document.lastModified);
 <a href="#Example%3A+WordCount+v2.0">Example: WordCount v2.0</a>
 <ul class="minitoc">
 <li>
-<a href="#Source+Code-N10F78">Source Code</a>
+<a href="#Source+Code-N10F9A">Source Code</a>
 </li>
 <li>
 <a href="#Sample+Runs">Sample Runs</a>
@@ -1588,6 +1597,8 @@ document.write("Last Published: " + document.lastModified);
 <span class="codefrag">&lt;/property&gt;</span>
         
 </p>
+<a name="N108BF"></a><a name="Memory+management"></a>
+<h4> Memory management</h4>
 <p>Users/admins can also specify the maximum virtual memory 
         of the launched child-task, and any sub-process it launches 
         recursively, using <span class="codefrag">mapred.child.ulimit</span>. Note that
@@ -1629,7 +1640,7 @@ document.write("Last Published: " + document.lastModified);
         counters for a job- particularly relative to byte counts from the map
         and into the reduce- is invaluable to the tuning of these
         parameters.</p>
-<a name="N108E9"></a><a name="Map+Parameters"></a>
+<a name="N108F0"></a><a name="Map+Parameters"></a>
 <h4>Map Parameters</h4>
 <p>A record emitted from a map will be serialized into a buffer and
           metadata will be stored into accounting buffers. As described in the
@@ -1703,7 +1714,7 @@ document.write("Last Published: " + document.lastModified);
             combiner.</li>
           
 </ul>
-<a name="N10955"></a><a name="Shuffle%2FReduce+Parameters"></a>
+<a name="N1095C"></a><a name="Shuffle%2FReduce+Parameters"></a>
 <h4>Shuffle/Reduce Parameters</h4>
 <p>As described previously, each reduce fetches the output assigned
           to it by the Partitioner via HTTP into memory and periodically
@@ -1799,6 +1810,8 @@ document.write("Last Published: " + document.lastModified);
             of the intermediate merge.</li>
           
 </ul>
+<a name="N109D7"></a><a name="Directory+Structure"></a>
+<h4> Directory Structure </h4>
 <p>The task tracker has local directory,
         <span class="codefrag"> ${mapred.local.dir}/taskTracker/</span> to create localized
         cache and localized job. It can define multiple local directories 
@@ -1869,7 +1882,9 @@ document.write("Last Published: " + document.lastModified);
         
 <li>
 <span class="codefrag">${mapred.local.dir}/taskTracker/jobcache/$jobid/$taskid/work</span>
-        : The curernt working directory of the task. </li>
+        : The curernt working directory of the task. 
+        With <a href="#Task+JVM+Reuse">jvm reuse</a> enabled for tasks, this 
+        directory will be the directory on which the jvm has started</li>
         
 <li>
 <span class="codefrag">${mapred.local.dir}/taskTracker/jobcache/$jobid/$taskid/work/tmp</span>
@@ -1896,6 +1911,17 @@ document.write("Last Published: " + document.lastModified);
 </li>
         
 </ul>
+<a name="N10A46"></a><a name="Task+JVM+Reuse"></a>
+<h4>Task JVM Reuse</h4>
+<p>Jobs can enable task JVMs to be reused by specifying the job 
+        configuration <span class="codefrag">mapred.job.reuse.jvm.num.tasks</span>. If the
+        value is 1 (the default), then JVMs are not reused 
+        (i.e. 1 task per JVM). If it is -1, there is no limit to the number
+        of tasks a JVM can run (of the same job). One can also specify some
+        value greater than 1 using the api 
+        <a href="api/org/apache/hadoop/mapred/JobConf.html#setNumTasksToExecutePerJvm(int)">
+        JobConf.setNumTasksToExecutePerJvm(int)</a>
+</p>
 <p>The following properties are localized in the job configuration 
          for each task's execution: </p>
 <table class="ForrestTable" cellspacing="1" cellpadding="4">
@@ -1977,7 +2003,7 @@ document.write("Last Published: " + document.lastModified);
         <a href="native_libraries.html#Loading+native+libraries+through+DistributedCache">
         native_libraries.html</a>
 </p>
-<a name="N10B0D"></a><a name="Job+Submission+and+Monitoring"></a>
+<a name="N10B2F"></a><a name="Job+Submission+and+Monitoring"></a>
 <h3 class="h4">Job Submission and Monitoring</h3>
 <p>
 <a href="api/org/apache/hadoop/mapred/JobClient.html">
@@ -2038,7 +2064,7 @@ document.write("Last Published: " + document.lastModified);
 <p>Normally the user creates the application, describes various facets 
         of the job via <span class="codefrag">JobConf</span>, and then uses the 
         <span class="codefrag">JobClient</span> to submit the job and monitor its progress.</p>
-<a name="N10B6D"></a><a name="Job+Control"></a>
+<a name="N10B8F"></a><a name="Job+Control"></a>
 <h4>Job Control</h4>
 <p>Users may need to chain Map/Reduce jobs to accomplish complex
           tasks which cannot be done via a single Map/Reduce job. This is fairly
@@ -2074,7 +2100,7 @@ document.write("Last Published: " + document.lastModified);
             </li>
           
 </ul>
-<a name="N10B97"></a><a name="Job+Input"></a>
+<a name="N10BB9"></a><a name="Job+Input"></a>
 <h3 class="h4">Job Input</h3>
 <p>
 <a href="api/org/apache/hadoop/mapred/InputFormat.html">
@@ -2122,7 +2148,7 @@ document.write("Last Published: " + document.lastModified);
         appropriate <span class="codefrag">CompressionCodec</span>. However, it must be noted that
         compressed files with the above extensions cannot be <em>split</em> and 
         each compressed file is processed in its entirety by a single mapper.</p>
-<a name="N10C01"></a><a name="InputSplit"></a>
+<a name="N10C23"></a><a name="InputSplit"></a>
 <h4>InputSplit</h4>
 <p>
 <a href="api/org/apache/hadoop/mapred/InputSplit.html">
@@ -2136,7 +2162,7 @@ document.write("Last Published: " + document.lastModified);
           FileSplit</a> is the default <span class="codefrag">InputSplit</span>. It sets 
           <span class="codefrag">map.input.file</span> to the path of the input file for the
           logical split.</p>
-<a name="N10C26"></a><a name="RecordReader"></a>
+<a name="N10C48"></a><a name="RecordReader"></a>
 <h4>RecordReader</h4>
 <p>
 <a href="api/org/apache/hadoop/mapred/RecordReader.html">
@@ -2148,7 +2174,7 @@ document.write("Last Published: " + document.lastModified);
           for processing. <span class="codefrag">RecordReader</span> thus assumes the 
           responsibility of processing record boundaries and presents the tasks 
           with keys and values.</p>
-<a name="N10C49"></a><a name="Job+Output"></a>
+<a name="N10C6B"></a><a name="Job+Output"></a>
 <h3 class="h4">Job Output</h3>
 <p>
 <a href="api/org/apache/hadoop/mapred/OutputFormat.html">
@@ -2173,7 +2199,7 @@ document.write("Last Published: " + document.lastModified);
 <p>
 <span class="codefrag">TextOutputFormat</span> is the default 
         <span class="codefrag">OutputFormat</span>.</p>
-<a name="N10C72"></a><a name="OutputCommitter"></a>
+<a name="N10C94"></a><a name="OutputCommitter"></a>
 <h4>OutputCommitter</h4>
 <p>
 <a href="api/org/apache/hadoop/mapred/OutputCommitter.html">
@@ -2215,7 +2241,7 @@ document.write("Last Published: " + document.lastModified);
 <p>
 <span class="codefrag">FileOutputCommitter</span> is the default 
         <span class="codefrag">OutputCommitter</span>.</p>
-<a name="N10CA2"></a><a name="Task+Side-Effect+Files"></a>
+<a name="N10CC4"></a><a name="Task+Side-Effect+Files"></a>
 <h4>Task Side-Effect Files</h4>
 <p>In some applications, component tasks need to create and/or write to
           side-files, which differ from the actual job-output files.</p>
@@ -2256,7 +2282,7 @@ document.write("Last Published: " + document.lastModified);
 <p>The entire discussion holds true for maps of jobs with 
            reducer=NONE (i.e. 0 reduces) since output of the map, in that case, 
            goes directly to HDFS.</p>
-<a name="N10CF0"></a><a name="RecordWriter"></a>
+<a name="N10D12"></a><a name="RecordWriter"></a>
 <h4>RecordWriter</h4>
 <p>
 <a href="api/org/apache/hadoop/mapred/RecordWriter.html">
@@ -2264,9 +2290,9 @@ document.write("Last Published: " + document.lastModified);
           pairs to an output file.</p>
 <p>RecordWriter implementations write the job outputs to the 
           <span class="codefrag">FileSystem</span>.</p>
-<a name="N10D07"></a><a name="Other+Useful+Features"></a>
+<a name="N10D29"></a><a name="Other+Useful+Features"></a>
 <h3 class="h4">Other Useful Features</h3>
-<a name="N10D0D"></a><a name="Counters"></a>
+<a name="N10D2F"></a><a name="Counters"></a>
 <h4>Counters</h4>
 <p>
 <span class="codefrag">Counters</span> represent global counters, defined either by 
@@ -2283,7 +2309,7 @@ document.write("Last Published: " + document.lastModified);
           in the <span class="codefrag">map</span> and/or 
           <span class="codefrag">reduce</span> methods. These counters are then globally 
           aggregated by the framework.</p>
-<a name="N10D3C"></a><a name="DistributedCache"></a>
+<a name="N10D5E"></a><a name="DistributedCache"></a>
 <h4>DistributedCache</h4>
 <p>
 <a href="api/org/apache/hadoop/filecache/DistributedCache.html">
@@ -2354,7 +2380,7 @@ document.write("Last Published: " + document.lastModified);
           <span class="codefrag">mapred.job.classpath.{files|archives}</span>. Similarly the
           cached files that are symlinked into the working directory of the
           task can be used to distribute native libraries and load them.</p>
-<a name="N10DBF"></a><a name="Tool"></a>
+<a name="N10DE1"></a><a name="Tool"></a>
 <h4>Tool</h4>
 <p>The <a href="api/org/apache/hadoop/util/Tool.html">Tool</a> 
           interface supports the handling of generic Hadoop command-line options.
@@ -2394,7 +2420,7 @@ document.write("Last Published: " + document.lastModified);
             </span>
           
 </p>
-<a name="N10DF1"></a><a name="IsolationRunner"></a>
+<a name="N10E13"></a><a name="IsolationRunner"></a>
 <h4>IsolationRunner</h4>
 <p>
 <a href="api/org/apache/hadoop/mapred/IsolationRunner.html">
@@ -2418,7 +2444,7 @@ document.write("Last Published: " + document.lastModified);
 <p>
 <span class="codefrag">IsolationRunner</span> will run the failed task in a single 
           jvm, which can be in the debugger, over precisely the same input.</p>
-<a name="N10E24"></a><a name="Profiling"></a>
+<a name="N10E46"></a><a name="Profiling"></a>
 <h4>Profiling</h4>
 <p>Profiling is a utility to get a representative (2 or 3) sample
           of built-in java profiler for a sample of maps and reduces. </p>
@@ -2451,7 +2477,7 @@ document.write("Last Published: " + document.lastModified);
           <span class="codefrag">-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s</span>
           
 </p>
-<a name="N10E58"></a><a name="Debugging"></a>
+<a name="N10E7A"></a><a name="Debugging"></a>
 <h4>Debugging</h4>
 <p>Map/Reduce framework provides a facility to run user-provided 
           scripts for debugging. When map/reduce task fails, user can run 
@@ -2462,14 +2488,14 @@ document.write("Last Published: " + document.lastModified);
 <p> In the following sections we discuss how to submit debug script
           along with the job. For submitting debug script, first it has to
           distributed. Then the script has to supplied in Configuration. </p>
-<a name="N10E64"></a><a name="How+to+distribute+script+file%3A"></a>
+<a name="N10E86"></a><a name="How+to+distribute+script+file%3A"></a>
 <h5> How to distribute script file: </h5>
 <p>
           The user has to use 
           <a href="mapred_tutorial.html#DistributedCache">DistributedCache</a>
           mechanism to <em>distribute</em> and <em>symlink</em> the
           debug script file.</p>
-<a name="N10E78"></a><a name="How+to+submit+script%3A"></a>
+<a name="N10E9A"></a><a name="How+to+submit+script%3A"></a>
 <h5> How to submit script: </h5>
 <p> A quick way to submit debug script is to set values for the 
           properties "mapred.map.task.debug.script" and 
@@ -2493,17 +2519,17 @@ document.write("Last Published: " + document.lastModified);
 <span class="codefrag">$script $stdout $stderr $syslog $jobconf $program </span>  
           
 </p>
-<a name="N10E9A"></a><a name="Default+Behavior%3A"></a>
+<a name="N10EBC"></a><a name="Default+Behavior%3A"></a>
 <h5> Default Behavior: </h5>
 <p> For pipes, a default script is run to process core dumps under
           gdb, prints stack trace and gives info about running threads. </p>
-<a name="N10EA5"></a><a name="JobControl"></a>
+<a name="N10EC7"></a><a name="JobControl"></a>
 <h4>JobControl</h4>
 <p>
 <a href="api/org/apache/hadoop/mapred/jobcontrol/package-summary.html">
           JobControl</a> is a utility which encapsulates a set of Map/Reduce jobs
           and their dependencies.</p>
-<a name="N10EB2"></a><a name="Data+Compression"></a>
+<a name="N10ED4"></a><a name="Data+Compression"></a>
 <h4>Data Compression</h4>
 <p>Hadoop Map/Reduce provides facilities for the application-writer to
           specify compression for both intermediate map-outputs and the
@@ -2517,7 +2543,7 @@ document.write("Last Published: " + document.lastModified);
           codecs for reasons of both performance (zlib) and non-availability of
           Java libraries (lzo). More details on their usage and availability are
           available <a href="native_libraries.html">here</a>.</p>
-<a name="N10ED2"></a><a name="Intermediate+Outputs"></a>
+<a name="N10EF4"></a><a name="Intermediate+Outputs"></a>
 <h5>Intermediate Outputs</h5>
 <p>Applications can control compression of intermediate map-outputs
             via the 
@@ -2526,7 +2552,7 @@ document.write("Last Published: " + document.lastModified);
             <span class="codefrag">CompressionCodec</span> to be used via the
             <a href="api/org/apache/hadoop/mapred/JobConf.html#setMapOutputCompressorClass(java.lang.Class)">
             JobConf.setMapOutputCompressorClass(Class)</a> api.</p>
-<a name="N10EE7"></a><a name="Job+Outputs"></a>
+<a name="N10F09"></a><a name="Job+Outputs"></a>
 <h5>Job Outputs</h5>
 <p>Applications can control compression of job-outputs via the
             <a href="api/org/apache/hadoop/mapred/FileOutputFormat.html#setCompressOutput(org.apache.hadoop.mapred.JobConf,%20boolean)">
@@ -2543,7 +2569,7 @@ document.write("Last Published: " + document.lastModified);
             <a href="api/org/apache/hadoop/mapred/SequenceFileOutputFormat.html#setOutputCompressionType(org.apache.hadoop.mapred.JobConf,%20org.apache.hadoop.io.SequenceFile.CompressionType)">
             SequenceFileOutputFormat.setOutputCompressionType(JobConf, 
             SequenceFile.CompressionType)</a> api.</p>
-<a name="N10F14"></a><a name="Skipping+Bad+Records"></a>
+<a name="N10F36"></a><a name="Skipping+Bad+Records"></a>
 <h4>Skipping Bad Records</h4>
 <p>Hadoop provides an optional mode of execution in which the bad 
           records are detected and skipped in further attempts. 
@@ -2617,7 +2643,7 @@ document.write("Last Published: " + document.lastModified);
 </div>
 
     
-<a name="N10F5E"></a><a name="Example%3A+WordCount+v2.0"></a>
+<a name="N10F80"></a><a name="Example%3A+WordCount+v2.0"></a>
 <h2 class="h3">Example: WordCount v2.0</h2>
 <div class="section">
 <p>Here is a more complete <span class="codefrag">WordCount</span> which uses many of the
@@ -2627,7 +2653,7 @@ document.write("Last Published: " + document.lastModified);
       <a href="quickstart.html#SingleNodeSetup">pseudo-distributed</a> or
       <a href="quickstart.html#Fully-Distributed+Operation">fully-distributed</a> 
       Hadoop installation.</p>
-<a name="N10F78"></a><a name="Source+Code-N10F78"></a>
+<a name="N10F9A"></a><a name="Source+Code-N10F9A"></a>
 <h3 class="h4">Source Code</h3>
 <table class="ForrestTable" cellspacing="1" cellpadding="4">
           
@@ -3837,7 +3863,7 @@ document.write("Last Published: " + document.lastModified);
 </tr>
         
 </table>
-<a name="N116DA"></a><a name="Sample+Runs"></a>
+<a name="N116FC"></a><a name="Sample+Runs"></a>
 <h3 class="h4">Sample Runs</h3>
 <p>Sample text-files as input:</p>
 <p>
@@ -4005,7 +4031,7 @@ document.write("Last Published: " + document.lastModified);
 <br>
         
 </p>
-<a name="N117AE"></a><a name="Highlights"></a>
+<a name="N117D0"></a><a name="Highlights"></a>
 <h3 class="h4">Highlights</h3>
 <p>The second version of <span class="codefrag">WordCount</span> improves upon the 
         previous one by using some features offered by the Map/Reduce framework:
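
The new Task JVM Reuse section documents mapred.job.reuse.jvm.num.tasks and the JobConf.setNumTasksToExecutePerJvm(int) API. A sketch of the configuration-file route, using the documented value -1 (unlimited reuse within a job); whether reuse is appropriate depends on the job, so treat this as an example rather than a recommendation:

<!-- Sketch: let one JVM run any number of tasks of the same job. -->
<property>
  <name>mapred.job.reuse.jvm.num.tasks</name>
  <value>-1</value>
  <description>-1 removes the per-JVM task limit; the default of 1
  disables reuse (one task per JVM).</description>
</property>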

The file diff is too large to display
+ 2 - 2
docs/mapred_tutorial.pdf


+ 15 - 9
docs/native_libraries.html

@@ -253,14 +253,18 @@ document.write("Last Published: " + document.lastModified);
 <li>
 <a href="http://www.oberhumer.com/opensource/lzo/">lzo</a>
 </li>
+        
+<li>
+<a href="http://www.bzip.org/">bzip2</a>
+</li>
       
 </ul>
 <p>Of the above, the availability of native hadoop libraries is imperative 
-      for the lzo and gzip compression codecs to work.</p>
+      for the lzo, gzip and bzip2 compression codecs to work.</p>
 </div>
 
     
-<a name="N1003D"></a><a name="Usage"></a>
+<a name="N10042"></a><a name="Usage"></a>
 <h2 class="h3">Usage</h2>
 <div class="section">
 <p>It is fairly simple to use the native hadoop libraries:</p>
@@ -279,7 +283,8 @@ document.write("Last Published: " + document.lastModified);
         </li>
         
 <li>
-          Make sure you have either or both of <strong>&gt;zlib-1.2</strong> and 
+          Make sure you have any of or all of <strong>&gt;zlib-1.2</strong>,
+          <strong>&gt;gzip-1.2</strong>, <strong>&gt;bzip2-1.0</strong> and 
           <strong>&gt;lzo2.0</strong> packages for your platform installed; 
           depending on your needs.
         </li>
@@ -314,7 +319,7 @@ document.write("Last Published: " + document.lastModified);
 </div>
     
     
-<a name="N10087"></a><a name="Supported+Platforms"></a>
+<a name="N10092"></a><a name="Supported+Platforms"></a>
 <h2 class="h3">Supported Platforms</h2>
 <div class="section">
 <p>Hadoop native library is supported only on *nix platforms only.
@@ -344,7 +349,7 @@ document.write("Last Published: " + document.lastModified);
 </div>
     
     
-<a name="N100B7"></a><a name="Building+Native+Hadoop+Libraries"></a>
+<a name="N100C2"></a><a name="Building+Native+Hadoop+Libraries"></a>
 <h2 class="h3">Building Native Hadoop Libraries</h2>
 <div class="section">
 <p>Hadoop native library is written in 
@@ -393,15 +398,16 @@ document.write("Last Published: " + document.lastModified);
 <p>where &lt;platform&gt; is combination of the system-properties: 
       <span class="codefrag">${os.name}-${os.arch}-${sun.arch.data.model}</span>; for e.g. 
       Linux-i386-32.</p>
-<a name="N1010A"></a><a name="Notes"></a>
+<a name="N10115"></a><a name="Notes"></a>
 <h3 class="h4">Notes</h3>
 <ul>
           
 <li>
-            It is <strong>mandatory</strong> to have both the zlib and lzo 
+            It is <strong>mandatory</strong> to have the 
+            zlib, gzip, bzip2 and lzo 
             development packages on the target platform for building the 
             native hadoop library; however for deployment it is sufficient to 
-            install zlib or lzo if you wish to use only one of them.
+            install one of them if you wish to use only one of them.
           </li>
           
 <li>
@@ -413,7 +419,7 @@ document.write("Last Published: " + document.lastModified);
 </ul>
 </div>
     
-<a name="N1011E"></a><a name="Loading+native+libraries+through+DistributedCache"></a>
+<a name="N10129"></a><a name="Loading+native+libraries+through+DistributedCache"></a>
 <h2 class="h3"> Loading native libraries through DistributedCache </h2>
 <div class="section">
 <p>User can load native shared libraries through  

The file diff is too large to display
+ 1 - 1
docs/native_libraries.pdf


+ 1 - 1
docs/quickstart.html

@@ -418,7 +418,7 @@ document.write("Last Published: " + document.lastModified);
 </tr>
             
 <tr>
-<td colspan="1" rowspan="1">&nbsp;&nbsp;&nbsp;&nbsp;&lt;value&gt;localhost:9000&lt;/value&gt;</td>
+<td colspan="1" rowspan="1">&nbsp;&nbsp;&nbsp;&nbsp;&lt;value&gt;hdfs://localhost:9000&lt;/value&gt;</td>
 </tr>
           
 <tr>

+ 47 - 47
docs/quickstart.pdf

@@ -256,10 +256,10 @@ endobj
 >>
 endobj
 44 0 obj
-<< /Length 1692 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1702 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
 stream
-Gat=,?#SIU'Rf_ZcpN@2^d:[.WK/Y;d&]gf@S>.gDsIt](2[il78K*4Gsh>\G9oism>s66"?&[5nD;&-hIBI8UVY2gnE7T.B(5UULgB(&5nLgjU"/f>`>""_E,tB*#@@\e-O8SF8!k@6;nu?,-7#b`?EA^i.!"8W#/(r5_$A=c/NSb>1ndpj$AcAB\/nEDIX"FR_KB[L\";V[.ZC5,\"9i4s&C._##7F3jC;i0FFH<`>ifG6o5]2#melL353l`W'A;V\VPY(8Supa7-X4+A;PSA63`RR8YS6dqEl1A3N%OkUZp4qiZLsq[<Vpf=pR65&1>0c[Uo%/6,u=V*A?s3J1(nILP_W,c/t[&uZ0#*q0k$!N.*?`gKY/;I?F*OE/j$&VluZFYLnGee=i5#-&q+#`]$=uJ&T(V!n8tA'JZVm+1XV'mOC!BHO-aB&gd[VAp[Olh^im;VWG$u/UlYOX_^1WfD,F>uiWK3`Z;o0`oS49Y11udeStE;224:"57Wqt/Yi!WUm.$6tK\L2%HUf[/4)%!-\a6+IZdW[NqRnH$(,!/IBU_ok]RCWl"L_7Ja@%B_M>UZqA<3q91iNh.8F6#STY8-A\f@E<'aAi^4Y4Ym0"KJFMm=P#G@3]Cm"'2k>,2g4VBS+`5ADN,R>fY&.M:IZJ*c]gN.fpk"_Wbrs#FpVHf=AA0b`!XK5i!VZ%bn(2;4q7I(,I(W.F>80f%.6io5EQ@uZgc,lF#^EP4qro@V?-L(f\De(HdcMlN%SGT/]802HGsZ.1;KO'EA%c/8S1F:Q@GqNQ@#`67hRCT^c:XQDMnpjs;Nf3]55KDlE&G%3fX[cccYb=RZIUEjN(Fei]0I+ORXm_u%r2OX0Y9@k`XXR'+pYuZr;45dUf,h)h,C[?N4]<4k3cG`uO?ogU1m8T)T'-[JJg'PHD&:pnOC)T_@-CRLI9MjM6q_U/<OGVCPKE"+%+1b5H(S@d%D@gt@((7B=Z^J[*/=.n\MUU"i/gJk=GO`PZ1S/`;(M.f]'MQ(-fNl7?X7T3"l[OZq@LglJ\K4q\g1,`p#Z2%p#qHH^ctY`3DI$P]K!@2sS>Ul.j;I)XS17PZ6n!$7[TIsS=E!I0!#9FR73"Rh'@PI+LL!;aM@l$`@1p\X[j#07rOU([;Q!)iP^,<E-';G+;_X/I;=`hjLQ-^A1:4S+M8s<gS?kV><S(=>(,D"kok?>NNgC9ZhD)B/Sk><8L1S!?fM7=oV8uCb=YGe$C&KT?.c1D6/C;[#NC:H66DTSs3mC>MBr0=POB60KVi#b<o!BsY1@g\j)Q2l<!fEadTB2PlmbWG<G^+W8q;LB*LJP[65;G_1VW]*+8CF#(8s+$R)ILgZ2e@]G._U@UbHkYr,j`@nWUC)h,Beb/*e(Zq4p9i%,Esq\N-g='^7HQ$0qtc2lWe65P<EgY=Eur1an$sNm+H#h(jQWLE%c""/]]It?pBS>8=oI'loN1EcAftn_j/t-moBO6i0!Cib7NH^:aZD'no3VcGIpinL+!O?QZ;_1SPEJu=`^-02*2ibjG*?)[*In'muXOccNdT%AFh%\g''oRPOt!^pg-ReYhm('L)kFCFHtJ0PbL.1:.0V^j7,?jeM^39o3)c/Ut&YeeE^;jFec+i<2aWG?a5IAIM1+M;bPd:gkp6?e=/:j$sP9o7\..fg:ZA>2kH[;]Dhlr`d4N~>
+Gat=,?#SIU'Rf_ZcpN@2^d:[.WK/S9d&]sj@S>.gDsIt](*.1q78K*4AOH4HG9oisjcDC."?&s=nD;&-hBPp"PL50-kg,CS4Mg4@.!Ca="@3%FKN>k6.&JDgJWk.f3r1VC3][6G5/QplScYM#2^*0ppL4Te7?It-0^.$e%mGN8CXH!aTa@hf<^?"/_=_d4&&V$q'!>_#^@iPM;IM=&\b1nepojOe1N3E\,9K"2Q1NbP`B"fNSsM*sVp=bVjKhT.23FG)AnkMCi82$IFL$Tn/i?g&V,N)B!'>c#,<'F<ak2JXc3om+3SQ)X<4+C!Fg"pq-'Oln7VZ+l>QPWo5UL9Z,U;8+6?sD:mIL(mQ5b:`8ERneGe=_H[TnW_4HCZV`E6G'$`9f.;p7p5-Q.mL)PEHT]]LEcL\7[jNQ)g_G%$Akf:*]28ej,qXXF2fA4uW@8OgR6F:%IK)sRU8e&>!L.h!RgCrdMI\^Y;NlCo!]g!GVYF.T$h`"\IsRm$B@QkQI@m$'92dgCg-5u(sOQXhAs7P!BNGG:Z4Gt0HbQci,R1HV^>Y!:!ZCuLOMNOC8ncPpI^8mj_\N(e>fON8MS62()B9EVcNKuDgZ$gIo(dcCHDXY?tn*FM1aE64qG>Efj/]fcaTa#hH>HC<&\R>U?us/;@PA27'3W^Q>!s09U;1(nIL'HB"(r9(TOoYT@^@j\":#NHtn@h"c0C:$]Lq<fBK;$5RP@Uu2J`Po_G`u6VQ8GF?6id-dma0M8pN+X?(Vb`ep/uhs@nTF&?^-''q``@jT5B9U41kPm`d4(T+lbi/4""uV!Y.8.5Z!QLmi(IAj?%5^&%/N>5fSp&SfH-Y0/13/h:R%LUerc[]nm,%S\eWsPfkeK;cX!k^Yd/&XLlI!sfcJ2i+JB"t?"<Vrn%,rA\I]^VKBWM!hl.S7-:@pr[1dig+Tba(eMNN_9f/%rR%UsKp1\C[+;:T*#60&(5BNIo00\-Tg`Yrg/=0h8AkraqZ!+C`.?7%+]8TQ>h:XT6H/_uYG?/n%PCb5"m^q@ph^\Trme=J+">Rr2nlQF)=nS0?3'pT^AB%5/'XZ^j]1\KWY@)/K3p5bb4F)Uj'ZZ(G#8%<u\1L`g.Fo%&%=j<pefkTsdKsW'dDIR\2T:JDLbOZrdVFuKR!r=O>cH0t/]p2H_NII8js_.e92Y91d@6,D3&7kO'T&`%2I<UULiq<Nm1qI>E:n*,ljf?mY$Num8KLKiQ\K(`bLfRInWh1U03Krg6#$9KMCRL$>;"=K2N:,<0LDRp1HE`S/SV'2TTIc?/Z4JODtC=e\ME`@3?"g\_ZG^jW6b%Z5<&&fhqd*Qrd=J1nbo@m^E8gf!E8Xb"!hU+B;N[V3Z;@$RMTRiXt%,8G0@4$D^=\cj4*gUreMTRS%E#kHlmu[`<"*u*ol++Lp*^I98k97q'm38_IS^35#q#G<q:X`Aj-%8%3oeb#&[u]ZiujD#4,tWS]I#oO'UpJdCE,b2dEo;nnp<m#B!]5?A@AbKK::OQA6E7]%O460LTmd]8Vq,Rrn@/aI;FBWnHo-+?-A\V0,8:^I,$X5OW0[ONijaC)7S<:""C^`-Wqi_KttU)KsaVcrJq3;i9Rb3Y;DlNQ;&Eeh;=F[&>:%79Zf^9YIT_>rV%P9nga0]/?(Wm0>@!\$^m+caDD98\I\_b-:!5CXfm&V9KC>J)^k#[/1B:Y^GL~>
 endstream
 endobj
 45 0 obj
@@ -582,39 +582,39 @@ endobj
 xref
 0 73
 0000000000 65535 f 
-0000015775 00000 n 
-0000015861 00000 n 
-0000015953 00000 n 
+0000015785 00000 n 
+0000015871 00000 n 
+0000015963 00000 n 
 0000000015 00000 n 
 0000000071 00000 n 
 0000001009 00000 n 
 0000001129 00000 n 
 0000001238 00000 n 
-0000016098 00000 n 
+0000016108 00000 n 
 0000001373 00000 n 
-0000016161 00000 n 
+0000016171 00000 n 
 0000001510 00000 n 
-0000016227 00000 n 
+0000016237 00000 n 
 0000001645 00000 n 
-0000016293 00000 n 
+0000016303 00000 n 
 0000001782 00000 n 
-0000016359 00000 n 
+0000016369 00000 n 
 0000001919 00000 n 
-0000016425 00000 n 
+0000016435 00000 n 
 0000002056 00000 n 
-0000016491 00000 n 
+0000016501 00000 n 
 0000002193 00000 n 
-0000016555 00000 n 
+0000016565 00000 n 
 0000002330 00000 n 
-0000016621 00000 n 
+0000016631 00000 n 
 0000002467 00000 n 
-0000016687 00000 n 
+0000016697 00000 n 
 0000002604 00000 n 
-0000016753 00000 n 
+0000016763 00000 n 
 0000002741 00000 n 
-0000016818 00000 n 
+0000016828 00000 n 
 0000002877 00000 n 
-0000016884 00000 n 
+0000016894 00000 n 
 0000003014 00000 n 
 0000005152 00000 n 
 0000005275 00000 n 
@@ -626,34 +626,34 @@ xref
 0000007964 00000 n 
 0000007991 00000 n 
 0000008183 00000 n 
-0000009968 00000 n 
-0000010076 00000 n 
-0000011465 00000 n 
-0000011588 00000 n 
-0000011629 00000 n 
-0000011801 00000 n 
-0000011973 00000 n 
-0000016948 00000 n 
-0000012145 00000 n 
-0000012278 00000 n 
-0000012507 00000 n 
-0000012722 00000 n 
-0000012939 00000 n 
-0000013154 00000 n 
-0000013308 00000 n 
-0000013619 00000 n 
-0000013844 00000 n 
-0000014156 00000 n 
-0000014336 00000 n 
-0000014594 00000 n 
-0000014750 00000 n 
-0000015002 00000 n 
-0000015115 00000 n 
-0000015225 00000 n 
-0000015336 00000 n 
-0000015444 00000 n 
-0000015550 00000 n 
-0000015666 00000 n 
+0000009978 00000 n 
+0000010086 00000 n 
+0000011475 00000 n 
+0000011598 00000 n 
+0000011639 00000 n 
+0000011811 00000 n 
+0000011983 00000 n 
+0000016958 00000 n 
+0000012155 00000 n 
+0000012288 00000 n 
+0000012517 00000 n 
+0000012732 00000 n 
+0000012949 00000 n 
+0000013164 00000 n 
+0000013318 00000 n 
+0000013629 00000 n 
+0000013854 00000 n 
+0000014166 00000 n 
+0000014346 00000 n 
+0000014604 00000 n 
+0000014760 00000 n 
+0000015012 00000 n 
+0000015125 00000 n 
+0000015235 00000 n 
+0000015346 00000 n 
+0000015454 00000 n 
+0000015560 00000 n 
+0000015676 00000 n 
 trailer
 <<
 /Size 73
@@ -661,5 +661,5 @@ trailer
 /Info 4 0 R
 >>
 startxref
-16999
+17009
 %%EOF

+ 91 - 24
docs/streaming.html

@@ -238,6 +238,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="#A+Useful+Partitioner+Class+%28secondary+sort%2C+the+-partitioner+org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner+option%29">A Useful Partitioner Class (secondary sort, the -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner option) </a>
 </li>
 <li>
+<a href="#A+Useful+Comparator+Class">A Useful Comparator Class</a>
+</li>
+<li>
 <a href="#Working+with+the+Hadoop+Aggregate+Package+%28the+-reduce+aggregate+option%29">Working with the Hadoop Aggregate Package (the -reduce aggregate option) </a>
 </li>
 <li>
@@ -615,10 +618,17 @@ In the above example, "-D stream.map.output.field.separator=." specifies "." as
 <p>
 Similarly, you can use "-D stream.reduce.output.field.separator=SEP" and "-D stream.num.reduce.output.fields=NUM" to specify the nth field separator in a line of the reduce outputs as the separator between the key and the value.
 </p>
-<a name="N101D9"></a><a name="A+Useful+Partitioner+Class+%28secondary+sort%2C+the+-partitioner+org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner+option%29"></a>
+<p> Similarly, you can specify "stream.map.input.field.separator" and 
+"stream.reduce.input.field.separator" as the input separator for map/reduce 
+inputs. By default the separator is the tab character.</p>
+<a name="N101DC"></a><a name="A+Useful+Partitioner+Class+%28secondary+sort%2C+the+-partitioner+org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner+option%29"></a>
 <h3 class="h4">A Useful Partitioner Class (secondary sort, the -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner option) </h3>
 <p>
-Hadoop has a library class, org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner, that is useful for many applications. This class allows the Map/Reduce framework to partition the map outputs based on prefixes of keys, not the whole keys. For example:
+Hadoop has a library class, 
+<a href="api/org/apache/hadoop/mapred/lib/KeyFieldBasedPartitioner.html">KeyFieldBasedPartitioner</a>, 
+that is useful for many applications. This class allows the Map/Reduce 
+framework to partition the map outputs based on certain key fields, not
+the whole keys. For example:
 </p>
 <pre class="code">
 $HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
@@ -630,14 +640,20 @@ $HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
     -D stream.map.output.field.separator=. \
     -D stream.num.map.output.key.fields=4 \
     -D map.output.key.field.separator=. \
-    -D num.key.fields.for.partition=2 \
+    -D mapred.text.key.partitioner.options=-k1,2\
     -D mapred.reduce.tasks=12
 </pre>
 <p>
 Here, <em>-D stream.map.output.field.separator=.</em> and <em>-D stream.num.map.output.key.fields=4</em> are as explained in previous example. The two variables are used by streaming to identify the key/value pair of mapper. 
 </p>
 <p>
-The map output keys of the above Map/Reduce job normally have four fields separated by ".". However, the Map/Reduce framework will partition the map outputs by the first two fields of the keys using the <em>-D num.key.fields.for.partition=2</em> option. Here, <em>-D map.output.key.field.separator=.</em> specifies the separator for the partition. This guarantees that all the key/value pairs with the same first two fields in the keys will be partitioned into the same reducer.
+The map output keys of the above Map/Reduce job normally have four fields
+separated by ".". However, the Map/Reduce framework will partition the map
+outputs by the first two fields of the keys using the 
+<em>-D mapred.text.key.partitioner.options=-k1,2</em> option. 
+Here, <em>-D map.output.key.field.separator=.</em> specifies the separator 
+for the partition. This guarantees that all the key/value pairs with the 
+same first two fields in the keys will be partitioned into the same reducer.
 </p>
 <p>
 
@@ -675,12 +691,59 @@ Sorting within each partition for the reducer(all 4 fields used for sorting)</p>
 11.14.2.2
 11.14.2.3
 </pre>
-<a name="N1020F"></a><a name="Working+with+the+Hadoop+Aggregate+Package+%28the+-reduce+aggregate+option%29"></a>
+<a name="N10216"></a><a name="A+Useful+Comparator+Class"></a>
+<h3 class="h4">A Useful Comparator Class</h3>
+<p>
+Hadoop has a library class, 
+<a href="api/org/apache/hadoop/mapred/lib/KeyFieldBasedComparator.html">KeyFieldBasedComparator</a>, 
+that is useful for many applications. This class provides a subset of features
+provided by the Unix/GNU Sort. For example:
+</p>
+<pre class="code">
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
+    -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
+    -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
+    -D stream.map.output.field.separator=. \
+    -D stream.num.map.output.key.fields=4 \
+    -D map.output.key.field.separator=. \
+    -D mapred.text.key.comparator.options=-k2,2nr\
+    -D mapred.reduce.tasks=12
+</pre>
+<p>
+The map output keys of the above Map/Reduce job normally have four fields
+separated by ".". However, the Map/Reduce framework will sort the 
+outputs by the second field of the keys using the 
+<em>-D mapred.text.key.comparator.options=-k2,2nr</em> option. 
+Here, <em>-n</em> specifies that the sorting is numerical sorting and 
+<em>-r</em> specifies that the result should be reversed. A simple illustration
+is shown below:
+</p>
+<p>
+Output of map (the keys)</p>
+<pre class="code">
+11.12.1.2
+11.14.2.3
+11.11.4.1
+11.12.1.1
+11.14.2.2
+</pre>
+<p>
+Sorting output for the reducer(where second field used for sorting)</p>
+<pre class="code">
+11.14.2.3
+11.14.2.2
+11.12.1.2
+11.12.1.1
+11.11.4.1
+</pre>
+<a name="N10242"></a><a name="Working+with+the+Hadoop+Aggregate+Package+%28the+-reduce+aggregate+option%29"></a>
 <h3 class="h4">Working with the Hadoop Aggregate Package (the -reduce aggregate option) </h3>
 <p>
-Hadoop has a library package called "Aggregate" (
-<a href="https://svn.apache.org/repos/asf/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/lib/aggregate">
-https://svn.apache.org/repos/asf/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/lib/aggregate</a>).
+Hadoop has a library package called 
+<a href="api/org/apache/hadoop/mapred/lib/aggregate/package-summary.html">Aggregate</a>.
 Aggregate provides a special reducer class and a special combiner class, and
 a list of simple aggregators that perform aggregations such as "sum", "max",
 "min" and so on  over a sequence of values. Aggregate allows you to define a
@@ -724,7 +787,7 @@ def main(argv):
 if __name__ == "__main__":
      main(sys.argv)
 </pre>
-<a name="N1022A"></a><a name="Field+Selection+%28+similar+to+unix+%27cut%27+command%29"></a>
+<a name="N1025D"></a><a name="Field+Selection+%28+similar+to+unix+%27cut%27+command%29"></a>
 <h3 class="h4">Field Selection ( similar to unix 'cut' command) </h3>
 <p>
 Hadoop has a library class, org.apache.hadoop.mapred.lib.FieldSelectionMapReduce, that effectively allows you to process text data like the unix "cut" utility. The map function defined in the class treats each input key/value pair as a list of fields. You can specify the field separator (the default is the tab character). You can select an arbitrary list of fields as the map output key, and an arbitrary list of fields as the map output value. Similarly, the reduce function defined in the class treats each input key/value pair as a list of fields. You can select an arbitrary list of fields as the reduce output key, and an arbitrary list of fields as the reduce output value. For example:
@@ -737,7 +800,7 @@ $HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
     -reducer org.apache.hadoop.mapred.lib.FieldSelectionMapReduce\
     -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
     -D map.output.key.field.separa=. \
-    -D num.key.fields.for.partition=2 \
+    -D mapred.text.key.partitioner.options=-k1,2 \
     -D mapred.data.field.separator=. \
     -D map.output.key.value.fields.spec=6,5,1-3:0- \
     -D reduce.output.key.value.fields.spec=0-2:5- \
@@ -748,20 +811,24 @@ The option "-D map.output.key.value.fields.spec=6,5,1-3:0-" specifies key/value
 the subsequent fields). 
 </p>
 <p>
-The option "-D reduce.output.key.value.fields.spec=0-2:0-" specifies key/value selection for the reduce outputs. In this case, the reduce output key will consist of fields 0, 1, 2 (corresponding to the original fields 6, 5, 1). The reduce output value will consist of all fields starting from field 5 (corresponding to all the original fields).  
+The option "-D reduce.output.key.value.fields.spec=0-2:5-" specifies 
+key/value selection for the reduce outputs. In this case, the reduce 
+output key will consist of fields 0, 1, 2 (corresponding to the original 
+fields 6, 5, 1). The reduce output value will consist of all fields starting
+from field 5 (corresponding to all the original fields).  
 </p>
 </div>
 
 
-<a name="N1023E"></a><a name="Frequently+Asked+Questions"></a>
+<a name="N10271"></a><a name="Frequently+Asked+Questions"></a>
 <h2 class="h3">Frequently Asked Questions </h2>
 <div class="section">
-<a name="N10244"></a><a name="How+do+I+use+Hadoop+Streaming+to+run+an+arbitrary+set+of+%28semi-%29independent+tasks%3F"></a>
+<a name="N10277"></a><a name="How+do+I+use+Hadoop+Streaming+to+run+an+arbitrary+set+of+%28semi-%29independent+tasks%3F"></a>
 <h3 class="h4">How do I use Hadoop Streaming to run an arbitrary set of (semi-)independent tasks? </h3>
 <p>
 Often you do not need the full power of Map Reduce, but only need to run multiple instances of the same program - either on different parts of the data, or on the same data, but with different parameters. You can use Hadoop Streaming to do this.
 </p>
-<a name="N1024E"></a><a name="How+do+I+process+files%2C+one+per+map%3F"></a>
+<a name="N10281"></a><a name="How+do+I+process+files%2C+one+per+map%3F"></a>
 <h3 class="h4">How do I process files, one per map? </h3>
 <p>
 As an example, consider the problem of zipping (compressing) a set of files across the hadoop cluster. You can achieve this using either of these methods:
@@ -805,13 +872,13 @@ As an example, consider the problem of zipping (compressing) a set of files acro
 </li>
 
 </ol>
-<a name="N10279"></a><a name="How+many+reducers+should+I+use%3F"></a>
+<a name="N102AC"></a><a name="How+many+reducers+should+I+use%3F"></a>
 <h3 class="h4">How many reducers should I use? </h3>
 <p>
 See the Hadoop Wiki for details: <a href="mapred_tutorial.html#Reducer">Reducer</a>
 
 </p>
-<a name="N10287"></a><a name="If+I+set+up+an+alias+in+my+shell+script%2C+will+that+work+after+-mapper%2C+i.e.+say+I+do%3A+alias+c1%3D%27cut+-f1%27.+Will+-mapper+%22c1%22+work%3F"></a>
+<a name="N102BA"></a><a name="If+I+set+up+an+alias+in+my+shell+script%2C+will+that+work+after+-mapper%2C+i.e.+say+I+do%3A+alias+c1%3D%27cut+-f1%27.+Will+-mapper+%22c1%22+work%3F"></a>
 <h3 class="h4">If I set up an alias in my shell script, will that work after -mapper, i.e. say I do: alias c1='cut -f1'. Will -mapper "c1" work? </h3>
 <p>
 Using an alias will not work, but variable substitution is allowed as shown in this example:
@@ -838,12 +905,12 @@ $ hadoop dfs -cat samples/student_out/part-00000
 75
 80
 </pre>
-<a name="N10295"></a><a name="Can+I+use+UNIX+pipes%3F+For+example%2C+will+-mapper+%22cut+-f1+%7C+sed+s%2Ffoo%2Fbar%2Fg%22+work%3F"></a>
+<a name="N102C8"></a><a name="Can+I+use+UNIX+pipes%3F+For+example%2C+will+-mapper+%22cut+-f1+%7C+sed+s%2Ffoo%2Fbar%2Fg%22+work%3F"></a>
 <h3 class="h4">Can I use UNIX pipes? For example, will -mapper "cut -f1 | sed s/foo/bar/g" work?</h3>
 <p>
 Currently this does not work and gives an "java.io.IOException: Broken pipe" error. This is probably a bug that needs to be investigated.
 </p>
-<a name="N1029F"></a><a name="When+I+run+a+streaming+job+by"></a>
+<a name="N102D2"></a><a name="When+I+run+a+streaming+job+by"></a>
 <h3 class="h4">When I run a streaming job by distributing large executables (for example, 3.6G) through the -file option, I get a "No space left on device" error. What do I do? </h3>
 <p>
 The jar packaging happens in a directory pointed to by the configuration variable stream.tmpdir. The default value of stream.tmpdir is /tmp. Set the value to a directory with more space:
@@ -851,7 +918,7 @@ The jar packaging happens in a directory pointed to by the configuration variabl
 <pre class="code">
 -D stream.tmpdir=/export/bigspace/...
 </pre>
-<a name="N102B0"></a><a name="How+do+I+specify+multiple+input+directories%3F"></a>
+<a name="N102E3"></a><a name="How+do+I+specify+multiple+input+directories%3F"></a>
 <h3 class="h4">How do I specify multiple input directories? </h3>
 <p>
 You can specify multiple input directories with multiple '-input' options:
@@ -859,17 +926,17 @@ You can specify multiple input directories with multiple '-input' options:
 <pre class="code">
  hadoop jar hadoop-streaming.jar -input '/user/foo/dir1' -input '/user/foo/dir2' 
 </pre>
-<a name="N102BD"></a><a name="How+do+I+generate+output+files+with+gzip+format%3F"></a>
+<a name="N102F0"></a><a name="How+do+I+generate+output+files+with+gzip+format%3F"></a>
 <h3 class="h4">How do I generate output files with gzip format? </h3>
 <p>
 Instead of plain text files, you can generate gzip files as your generated output. Pass '-D mapred.output.compress=true -D  mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCode' as option to your streaming job.
 </p>
-<a name="N102C7"></a><a name="How+do+I+provide+my+own+input%2Foutput+format+with+streaming%3F"></a>
+<a name="N102FA"></a><a name="How+do+I+provide+my+own+input%2Foutput+format+with+streaming%3F"></a>
 <h3 class="h4">How do I provide my own input/output format with streaming? </h3>
 <p>
 At least as late as version 0.14, Hadoop does not support multiple jar files. So, when specifying your own custom classes you will have to pack them along with the streaming jar and use the custom jar instead of the default hadoop streaming jar. 
 </p>
-<a name="N102D1"></a><a name="How+do+I+parse+XML+documents+using+streaming%3F"></a>
+<a name="N10304"></a><a name="How+do+I+parse+XML+documents+using+streaming%3F"></a>
 <h3 class="h4">How do I parse XML documents using streaming? </h3>
 <p>
 You can use the record reader StreamXmlRecordReader to process XML documents. 
@@ -880,14 +947,14 @@ hadoop jar hadoop-streaming.jar -inputreader "StreamXmlRecord,begin=BEGIN_STRING
 <p>
 Anything found between BEGIN_STRING and END_STRING would be treated as one record for map tasks.
 </p>
-<a name="N102E2"></a><a name="How+do+I+update+counters+in+streaming+applications%3F"></a>
+<a name="N10315"></a><a name="How+do+I+update+counters+in+streaming+applications%3F"></a>
 <h3 class="h4">How do I update counters in streaming applications? </h3>
 <p>
 A streaming process can use the stderr to emit counter information.
 <span class="codefrag">reporter:counter:&lt;group&gt;,&lt;counter&gt;,&lt;amount&gt;</span> 
 should be sent to stderr to update the counter.
 </p>
-<a name="N102EF"></a><a name="How+do+I+update+status+in+streaming+applications%3F"></a>
+<a name="N10322"></a><a name="How+do+I+update+status+in+streaming+applications%3F"></a>
 <h3 class="h4">How do I update status in streaming applications? </h3>
 <p>
 A streaming process can use the stderr to emit status information.
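
The separator properties introduced earlier in this file's diff, stream.map.input.field.separator and stream.reduce.input.field.separator, are ordinary job configuration settings and default to the tab character. A hedged sketch that overrides the default with "." (the value mirrors the "." used in the partitioner examples above and is purely illustrative; in practice these are typically passed as -D options on the streaming command line):

<!-- Illustrative only: use "." instead of the default tab as the input field separator. -->
<property>
  <name>stream.map.input.field.separator</name>
  <value>.</value>
</property>

<property>
  <name>stream.reduce.input.field.separator</name>
  <value>.</value>
</property>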

The file diff is too large to display
+ 69 - 47
docs/streaming.pdf


+ 12 - 0
src/docs/src/documentation/content/xdocs/cluster_setup.xml

@@ -420,6 +420,18 @@
       to the <code>HADOOP_CONF_DIR</code> directory on all the machines, 
       typically <code>${HADOOP_HOME}/conf</code>.</p>
     </section>
+    <section>
+      <title>Cluster Restartability</title>
+      <section>
+        <title>Map/Reduce</title>
+        <p>The job tracker restart can recover running jobs if 
+        <code>mapred.jobtracker.restart.recover</code> is set true and 
+        <a href="#Logging">JobHistory logging</a> is enabled. Also 
+        <code>mapred.jobtracker.job.history.block.size</code> value should be 
+        set to an optimal value to dump job history to disk as soon as 
+        possible, the typical value is 3145728(3MB).</p>
+      </section>
+    </section>
     
     <section>
       <title>Hadoop Rack Awareness</title>

+ 21 - 1
src/docs/src/documentation/content/xdocs/mapred_tutorial.xml

@@ -1083,6 +1083,8 @@
           <code>&lt;/property&gt;</code>
         </p>
         
+        <section>
+        <title> Memory management</title>
         <p>Users/admins can also specify the maximum virtual memory 
         of the launched child-task, and any sub-process it launches 
         recursively, using <code>mapred.child.ulimit</code>. Note that
@@ -1126,6 +1128,7 @@
         counters for a job- particularly relative to byte counts from the map
         and into the reduce- is invaluable to the tuning of these
         parameters.</p>
+        </section>
 
         <section>
           <title>Map Parameters</title>
@@ -1269,6 +1272,8 @@
 
         </section>
 
+        <section>
+        <title> Directory Structure </title>
         <p>The task tracker has local directory,
         <code> ${mapred.local.dir}/taskTracker/</code> to create localized
         cache and localized job. It can define multiple local directories 
@@ -1321,7 +1326,9 @@
         temporary map reduce data generated by the framework
         such as map output files etc. </li>
         <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/$taskid/work</code>
-        : The curernt working directory of the task. </li>
+        : The curernt working directory of the task. 
+        With <a href="#Task+JVM+Reuse">jvm reuse</a> enabled for tasks, this 
+        directory will be the directory on which the jvm has started</li>
         <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/$taskid/work/tmp</code>
         : The temporary directory for the task. 
         (User can specify the property <code>mapred.child.tmp</code> to set
@@ -1340,6 +1347,19 @@
         </ul>
         </li>
         </ul>
+        </section>
+        
+        <section>
+        <title>Task JVM Reuse</title>
+        <p>Jobs can enable task JVMs to be reused by specifying the job 
+        configuration <code>mapred.job.reuse.jvm.num.tasks</code>. If the
+        value is 1 (the default), then JVMs are not reused 
+        (i.e. 1 task per JVM). If it is -1, there is no limit to the number
+        of tasks a JVM can run (of the same job). One can also specify some
+        value greater than 1 using the api 
+        <a href="ext:api/org/apache/hadoop/mapred/jobconf/setnumtaskstoexecuteperjvm">
+        JobConf.setNumTasksToExecutePerJvm(int)</a></p>
+        </section>
 
         <p>The following properties are localized in the job configuration 
          for each task's execution: </p>
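
The Memory management section added above centres on mapred.child.ulimit, the virtual-memory ceiling for launched child tasks and their sub-processes. A minimal sketch, assuming the kilobyte unit this property uses and an arbitrary 1 GB example value:

<!-- Sketch: cap child-task virtual memory at roughly 1 GB (1048576 KB);
     the figure is an example, not a value from this commit. -->
<property>
  <name>mapred.child.ulimit</name>
  <value>1048576</value>
  <description>Maximum virtual memory, in KB, available to the launched
  child task and any sub-process it spawns recursively.</description>
</property>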

+ 7 - 4
src/docs/src/documentation/content/xdocs/native_libraries.xml

@@ -45,10 +45,11 @@
         <li><a href="ext:zlib">zlib</a></li>
         <li><a href="ext:gzip">gzip</a></li>
         <li><a href="ext:lzo">lzo</a></li>
+        <li><a href="ext:bzip">bzip2</a></li>
       </ul>
       
       <p>Of the above, the availability of native hadoop libraries is imperative 
-      for the lzo and gzip compression codecs to work.</p>
+      for the lzo, gzip and bzip2 compression codecs to work.</p>
     </section>
 
     <section>
@@ -68,7 +69,8 @@
           <a href="#Building+Native+Hadoop+Libraries">build</a> them yourself.
         </li>
         <li>
-          Make sure you have either or both of <strong>&gt;zlib-1.2</strong> and 
+          Make sure you have any of or all of <strong>&gt;zlib-1.2</strong>,
+          <strong>&gt;gzip-1.2</strong>, <strong>&gt;bzip2-1.0</strong> and 
           <strong>&gt;lzo2.0</strong> packages for your platform installed; 
           depending on your needs.
         </li>
@@ -172,10 +174,11 @@
         
         <ul>
           <li>
-            It is <strong>mandatory</strong> to have both the zlib and lzo 
+            It is <strong>mandatory</strong> to have the 
+            zlib, gzip, bzip2 and lzo 
             development packages on the target platform for building the 
             native hadoop library; however for deployment it is sufficient to 
-            install zlib or lzo if you wish to use only one of them.
+            install one of them if you wish to use only one of them.
           </li>
           <li>
             It is necessary to have the correct 32/64 libraries of both zlib/lzo 

+ 7 - 0
src/docs/src/documentation/content/xdocs/site.xml

@@ -74,6 +74,7 @@ See http://forrest.apache.org/docs/linking.html for more info.
     <zlib      href="http://www.zlib.net/" />
     <lzo       href="http://www.oberhumer.com/opensource/lzo/" />
     <gzip      href="http://www.gzip.org/" />
+    <bzip      href="http://www.bzip.org/" />
     <cygwin    href="http://www.cygwin.com/" />
     <osx       href="http://www.apple.com/macosx" />
     <hod href="">
@@ -180,6 +181,7 @@ See http://forrest.apache.org/docs/linking.html for more info.
                 <setprofileenabled href="#setProfileEnabled(boolean)" />
                 <setprofiletaskrange href="#setProfileTaskRange(boolean,%20java.lang.String)" />
                 <setprofileparams href="#setProfileParams(java.lang.String)" />
+                <setnumtaskstoexecuteperjvm href="#setNumTasksToExecutePerJvm(int)" />
                 <getjoblocaldir href="#getJobLocalDir()" />
                 <getjar href="#getJar()" />
               </jobconf>
@@ -225,6 +227,11 @@ See http://forrest.apache.org/docs/linking.html for more info.
               <lib href="lib/">
                 <package-summary href="package-summary.html" />
                 <hashpartitioner href="HashPartitioner.html" />
+                <keyfieldbasedpartitioner href="KeyFieldBasedPartitioner.html" />
+                <keyfieldbasedcomparator href="KeyFieldBasedComparator.html" />
+                <aggregate href="aggregate/">
+                  <package-summary href="package-summary.html" />
+                </aggregate>
               </lib>
               <pipes href="pipes/">
                 <package-summary href="package-summary.html" />

+ 73 - 8
src/docs/src/documentation/content/xdocs/streaming.xml

@@ -312,13 +312,20 @@ In the above example, "-D stream.map.output.field.separator=." specifies "." as
 </p><p>
 Similarly, you can use "-D stream.reduce.output.field.separator=SEP" and "-D stream.num.reduce.output.fields=NUM" to specify the nth field separator in a line of the reduce outputs as the separator between the key and the value.
 </p>
+<p> Similarly, you can specify "stream.map.input.field.separator" and 
+"stream.reduce.input.field.separator" as the field separator for the map and 
+reduce inputs. By default the separator is the tab character.</p>
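
As an illustrative sketch (not part of the change above), a streaming invocation that overrides these input separators could follow the same pattern as the other examples in this document; the input and output directory names are placeholders:

<source>
$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
    -input myInputDirs \
    -output myOutputDir \
    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
    -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
    -D stream.map.input.field.separator=. \
    -D stream.reduce.input.field.separator=. \
    -D stream.map.output.field.separator=. \
    -D stream.num.map.output.key.fields=4
</source>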
 </section>
 
 
 <section>
 <title>A Useful Partitioner Class (secondary sort, the -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner option) </title>
 <p>
-Hadoop has a library class, org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner, that is useful for many applications. This class allows the Map/Reduce framework to partition the map outputs based on prefixes of keys, not the whole keys. For example:
+Hadoop has a library class, 
+<a href="ext:api/org/apache/hadoop/mapred/lib/keyfieldbasedpartitioner">KeyFieldBasedPartitioner</a>, 
+that is useful for many applications. This class allows the Map/Reduce 
+framework to partition the map outputs based on certain key fields, not
+the whole keys. For example:
 </p>
 <source>
 $HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
@@ -330,13 +337,19 @@ $HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
     -D stream.map.output.field.separator=. \
     -D stream.num.map.output.key.fields=4 \
     -D map.output.key.field.separator=. \
-    -D num.key.fields.for.partition=2 \
+    -D mapred.text.key.partitioner.options=-k1,2 \
     -D mapred.reduce.tasks=12
 </source>
 <p>
 Here, <em>-D stream.map.output.field.separator=.</em> and <em>-D stream.num.map.output.key.fields=4</em> are as explained in previous example. The two variables are used by streaming to identify the key/value pair of mapper. 
 </p><p>
-The map output keys of the above Map/Reduce job normally have four fields separated by ".". However, the Map/Reduce framework will partition the map outputs by the first two fields of the keys using the <em>-D num.key.fields.for.partition=2</em> option. Here, <em>-D map.output.key.field.separator=.</em> specifies the separator for the partition. This guarantees that all the key/value pairs with the same first two fields in the keys will be partitioned into the same reducer.
+The map output keys of the above Map/Reduce job normally have four fields
+separated by ".". However, the Map/Reduce framework will partition the map
+outputs by the first two fields of the keys using the 
+<em>-D mapred.text.key.partitioner.options=-k1,2</em> option. 
+Here, <em>-D map.output.key.field.separator=.</em> specifies the separator 
+for the partition. This guarantees that all the key/value pairs with the 
+same first two fields in the keys will be partitioned into the same reducer.
 </p><p>
 <em>This is effectively equivalent to specifying the first two fields as the primary key and the next two fields as the secondary. The primary key is used for partitioning, and the combination of the primary and secondary keys is used for sorting.</em> A simple illustration is shown here:
 </p>
@@ -370,13 +383,61 @@ Sorting within each partition for the reducer(all 4 fields used for sorting)</p>
 11.14.2.3
 </source>
 </section>
+<section>
+<title>A Useful Comparator Class</title>
+<p>
+Hadoop has a library class, 
+<a href="ext:api/org/apache/hadoop/mapred/lib/keyfieldbasedcomparator">KeyFieldBasedComparator</a>, 
+that is useful for many applications. This class provides a subset of features
+provided by the Unix/GNU Sort. For example:
+</p>
+<source>
+$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
+    -input myInputDirs \
+    -output myOutputDir \
+    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
+    -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
+    -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
+    -D stream.map.output.field.separator=. \
+    -D stream.num.map.output.key.fields=4 \
+    -D map.output.key.field.separator=. \
+    -D mapred.text.key.comparator.options=-k2,2nr \
+    -D mapred.reduce.tasks=12
+</source>
+<p>
+The map output keys of the above Map/Reduce job normally have four fields
+separated by ".". However, the Map/Reduce framework will sort the 
+outputs by the second field of the keys using the 
+<em>-D mapred.text.key.comparator.options=-k2,2nr</em> option. 
+Here, <em>-n</em> specifies that the sorting is numerical and 
+<em>-r</em> specifies that the result should be reversed. A simple illustration
+is shown below:
+</p>
+<p>
+Output of map (the keys)</p>
+<source>
+11.12.1.2
+11.14.2.3
+11.11.4.1
+11.12.1.1
+11.14.2.2
+</source>
+<p>
+Sorting output for the reducer (where the second field is used for sorting)</p>
+<source>
+11.14.2.3
+11.14.2.2
+11.12.1.2
+11.12.1.1
+11.11.4.1
+</source>
+</section>
 
 <section>
 <title>Working with the Hadoop Aggregate Package (the -reduce aggregate option) </title>
 <p>
-Hadoop has a library package called "Aggregate" (
-<a href="https://svn.apache.org/repos/asf/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/lib/aggregate">
-https://svn.apache.org/repos/asf/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/lib/aggregate</a>).
+Hadoop has a library package called 
+<a href="ext:api/org/apache/hadoop/mapred/lib/aggregate/package-summary">Aggregate</a>.
 Aggregate provides a special reducer class and a special combiner class, and
 a list of simple aggregators that perform aggregations such as "sum", "max",
 "min" and so on  over a sequence of values. Aggregate allows you to define a
@@ -434,7 +495,7 @@ $HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
     -reducer org.apache.hadoop.mapred.lib.FieldSelectionMapReduce\
     -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
    -D map.output.key.field.separator=. \
-    -D num.key.fields.for.partition=2 \
+    -D mapred.text.key.partitioner.options=-k1,2 \
     -D mapred.data.field.separator=. \
     -D map.output.key.value.fields.spec=6,5,1-3:0- \
     -D reduce.output.key.value.fields.spec=0-2:5- \
@@ -444,7 +505,11 @@ $HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
 The option "-D map.output.key.value.fields.spec=6,5,1-3:0-" specifies key/value selection for the map outputs. Key selection spec and value selection spec are separated by ":". In this case, the map output key will consist of fields 6, 5, 1, 2, and 3. The map output value will consist of all fields (0- means field 0 and all 
 the subsequent fields). 
 </p><p>
-The option "-D reduce.output.key.value.fields.spec=0-2:0-" specifies key/value selection for the reduce outputs. In this case, the reduce output key will consist of fields 0, 1, 2 (corresponding to the original fields 6, 5, 1). The reduce output value will consist of all fields starting from field 5 (corresponding to all the original fields).  
+The option "-D reduce.output.key.value.fields.spec=0-2:5-" specifies 
+key/value selection for the reduce outputs. In this case, the reduce 
+output key will consist of fields 0, 1, 2 (corresponding to the original 
+fields 6, 5, 1). The reduce output value will consist of all fields starting
+from field 5 (corresponding to all the original fields).  
 </p>
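
As an illustrative sketch (not part of the change above), a sample walk-through of the field selection specs used in this example, assuming the reduce-side selection operates on the key fields followed by the value fields:

<source>
Input record (fields separated by "."):   0.1.2.3.4.5.6.7
Map output key   (spec 6,5,1-3):          6.5.1.2.3
Map output value (spec 0-):               0.1.2.3.4.5.6.7
Reduce output key   (spec 0-2):           6.5.1
Reduce output value (spec 5-):            0.1.2.3.4.5.6.7
</source>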
 </section>
 </section>

+ 2 - 2
src/mapred/org/apache/hadoop/mapred/package.html

@@ -172,8 +172,8 @@ public class Grep extends Configured implements Tool {
     
     grepJob.setJobName("grep");
 
-    grepJob.setInputPath(new Path(args[0]));
-    grepJob.setOutputPath(args[1]);
+    FileInputFormat.setInputPaths(grepJob, new Path(args[0]));
+    FileOutputFormat.setOutputPath(grepJob, new Path(args[1]));
 
     grepJob.setMapperClass(GrepMapper.class);
     grepJob.setCombinerClass(GrepReducer.class);

+ 6 - 0
src/mapred/org/apache/hadoop/mapred/pipes/package.html

@@ -117,5 +117,11 @@ defined, the Java one will be used. The partition function will be
 called by the C++ framework before the key/value pair is sent back to
 Java.
 
+<p>
+
+Application programs can also register counters with a group and a name, 
+increment them, and read their values. A word-count 
+example illustrating pipes usage with counters is available at 
+<a href="https://svn.apache.org/repos/asf/hadoop/core/trunk/src/examples/pipes/impl/wordcount-simple.cc">wordcount-simple.cc</a>.
 </body>
 </html>

Some files were not shown because too many files changed in this diff