|
@@ -1,3131 +0,0 @@
|
|
|
-<?xml version="1.0"?>
|
|
|
-<!--
|
|
|
- Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
- contributor license agreements. See the NOTICE file distributed with
|
|
|
- this work for additional information regarding copyright ownership.
|
|
|
- The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
- (the "License"); you may not use this file except in compliance with
|
|
|
- the License. You may obtain a copy of the License at
|
|
|
-
|
|
|
- http://www.apache.org/licenses/LICENSE-2.0
|
|
|
-
|
|
|
- Unless required by applicable law or agreed to in writing, software
|
|
|
- distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
- See the License for the specific language governing permissions and
|
|
|
- limitations under the License.
|
|
|
--->
|
|
|
-
|
|
|
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
|
|
|
-
|
|
|
-<document>
|
|
|
-
|
|
|
- <header>
|
|
|
- <title>Map/Reduce Tutorial</title>
|
|
|
- </header>
|
|
|
-
|
|
|
- <body>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Purpose</title>
|
|
|
-
|
|
|
- <p>This document comprehensively describes all user-facing facets of the
|
|
|
- Hadoop Map/Reduce framework and serves as a tutorial.
|
|
|
- </p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Pre-requisites</title>
|
|
|
-
|
|
|
-      <p>Ensure that Hadoop is installed, configured and running. More
|
|
|
- details:</p>
|
|
|
- <ul>
|
|
|
- <li>
|
|
|
- <a href="quickstart.html">Hadoop Quick Start</a> for first-time users.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- <a href="cluster_setup.html">Hadoop Cluster Setup</a> for large,
|
|
|
- distributed clusters.
|
|
|
- </li>
|
|
|
- </ul>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Overview</title>
|
|
|
-
|
|
|
- <p>Hadoop Map/Reduce is a software framework for easily writing
|
|
|
- applications which process vast amounts of data (multi-terabyte data-sets)
|
|
|
- in-parallel on large clusters (thousands of nodes) of commodity
|
|
|
- hardware in a reliable, fault-tolerant manner.</p>
|
|
|
-
|
|
|
- <p>A Map/Reduce <em>job</em> usually splits the input data-set into
|
|
|
- independent chunks which are processed by the <em>map tasks</em> in a
|
|
|
- completely parallel manner. The framework sorts the outputs of the maps,
|
|
|
- which are then input to the <em>reduce tasks</em>. Typically both the
|
|
|
- input and the output of the job are stored in a file-system. The framework
|
|
|
-      takes care of scheduling tasks, monitoring them and re-executing the failed
|
|
|
- tasks.</p>
|
|
|
-
|
|
|
- <p>Typically the compute nodes and the storage nodes are the same, that is,
|
|
|
- the Map/Reduce framework and the Hadoop Distributed File System (see <a href="hdfs_design.html">HDFS Architecture </a>)
|
|
|
- are running on the same set of nodes. This configuration
|
|
|
- allows the framework to effectively schedule tasks on the nodes where data
|
|
|
- is already present, resulting in very high aggregate bandwidth across the
|
|
|
- cluster.</p>
|
|
|
-
|
|
|
- <p>The Map/Reduce framework consists of a single master
|
|
|
- <code>JobTracker</code> and one slave <code>TaskTracker</code> per
|
|
|
- cluster-node. The master is responsible for scheduling the jobs' component
|
|
|
- tasks on the slaves, monitoring them and re-executing the failed tasks. The
|
|
|
- slaves execute the tasks as directed by the master.</p>
|
|
|
-
|
|
|
- <p>Minimally, applications specify the input/output locations and supply
|
|
|
- <em>map</em> and <em>reduce</em> functions via implementations of
|
|
|
- appropriate interfaces and/or abstract-classes. These, and other job
|
|
|
- parameters, comprise the <em>job configuration</em>. The Hadoop
|
|
|
- <em>job client</em> then submits the job (jar/executable etc.) and
|
|
|
- configuration to the <code>JobTracker</code> which then assumes the
|
|
|
- responsibility of distributing the software/configuration to the slaves,
|
|
|
- scheduling tasks and monitoring them, providing status and diagnostic
|
|
|
- information to the job-client.</p>
|
|
|
-
|
|
|
- <p>Although the Hadoop framework is implemented in Java<sup>TM</sup>,
|
|
|
- Map/Reduce applications need not be written in Java.</p>
|
|
|
- <ul>
|
|
|
- <li>
|
|
|
- <a href="ext:api/org/apache/hadoop/streaming/package-summary">
|
|
|
- Hadoop Streaming</a> is a utility which allows users to create and run
|
|
|
- jobs with any executables (e.g. shell utilities) as the mapper and/or
|
|
|
- the reducer.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/pipes/package-summary">
|
|
|
- Hadoop Pipes</a> is a <a href="http://www.swig.org/">SWIG</a>-
|
|
|
- compatible <em>C++ API</em> to implement Map/Reduce applications (non
|
|
|
- JNI<sup>TM</sup> based).
|
|
|
- </li>
|
|
|
- </ul>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Inputs and Outputs</title>
|
|
|
-
|
|
|
- <p>The Map/Reduce framework operates exclusively on
|
|
|
- <code><key, value></code> pairs, that is, the framework views the
|
|
|
- input to the job as a set of <code><key, value></code> pairs and
|
|
|
- produces a set of <code><key, value></code> pairs as the output of
|
|
|
- the job, conceivably of different types.</p>
|
|
|
-
|
|
|
- <p>The <code>key</code> and <code>value</code> classes have to be
|
|
|
- serializable by the framework and hence need to implement the
|
|
|
- <a href="ext:api/org/apache/hadoop/io/writable">Writable</a>
|
|
|
- interface. Additionally, the <code>key</code> classes have to implement the
|
|
|
- <a href="ext:api/org/apache/hadoop/io/writablecomparable">
|
|
|
- WritableComparable</a> interface to facilitate sorting by the framework.
|
|
|
- </p>
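-
-      <p>Purely as an illustrative sketch (the class name <code>MyKey</code> and its
-      field are not part of this tutorial, and the usual <code>java.io</code> and
-      <code>org.apache.hadoop.io</code> imports are assumed), a custom key type might
-      look like this:</p>
-      <p>
-        <code>public class MyKey implements WritableComparable<MyKey> {</code><br/>
-        <code>  private long id;</code><br/>
-        <code>  public void write(DataOutput out) throws IOException { out.writeLong(id); }</code><br/>
-        <code>  public void readFields(DataInput in) throws IOException { id = in.readLong(); }</code><br/>
-        <code>  public int compareTo(MyKey other) {</code><br/>
-        <code>    return (id < other.id) ? -1 : ((id == other.id) ? 0 : 1);</code><br/>
-        <code>  }</code><br/>
-        <code>}</code><br/>
-      </p>
-      <p>A real key would normally also override <code>hashCode()</code> so that the
-      default <code>HashPartitioner</code> sends equal keys to the same reduce.</p>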
|
|
|
-
|
|
|
- <p>Input and Output types of a Map/Reduce job:</p>
|
|
|
- <p>
|
|
|
- (input) <code><k1, v1></code>
|
|
|
- ->
|
|
|
- <strong>map</strong>
|
|
|
- ->
|
|
|
- <code><k2, v2></code>
|
|
|
- ->
|
|
|
- <strong>combine</strong>
|
|
|
- ->
|
|
|
- <code><k2, v2></code>
|
|
|
- ->
|
|
|
- <strong>reduce</strong>
|
|
|
- ->
|
|
|
- <code><k3, v3></code> (output)
|
|
|
- </p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Example: WordCount v1.0</title>
|
|
|
-
|
|
|
-      <p>Before we jump into the details, let's walk through an example Map/Reduce
-      application to get a flavour for how they work.</p>
|
|
|
-
|
|
|
-      <p><code>WordCount</code> is a simple application that counts the number of
-      occurrences of each word in a given input set.</p>
|
|
|
-
|
|
|
-      <p>This works with a local-standalone, pseudo-distributed or fully-distributed
-      Hadoop installation (see <a href="quickstart.html">Hadoop Quick Start</a>).</p>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Source Code</title>
|
|
|
-
|
|
|
- <table>
|
|
|
- <tr>
|
|
|
- <th></th>
|
|
|
- <th>WordCount.java</th>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>1.</td>
|
|
|
- <td>
|
|
|
- <code>package org.myorg;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>2.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>3.</td>
|
|
|
- <td>
|
|
|
- <code>import java.io.IOException;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>4.</td>
|
|
|
- <td>
|
|
|
- <code>import java.util.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>5.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>6.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.fs.Path;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>7.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.conf.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>8.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.io.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>9.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.mapred.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>10.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.util.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>11.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>12.</td>
|
|
|
- <td>
|
|
|
- <code>public class WordCount {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>13.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>14.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- public static class Map extends MapReduceBase
|
|
|
- implements Mapper<LongWritable, Text, Text, IntWritable> {
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>15.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- private final static IntWritable one = new IntWritable(1);
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>16.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>private Text word = new Text();</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>17.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>18.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- public void map(LongWritable key, Text value,
|
|
|
- OutputCollector<Text, IntWritable> output,
|
|
|
- Reporter reporter) throws IOException {
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>19.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>String line = value.toString();</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>20.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>StringTokenizer tokenizer = new StringTokenizer(line);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>21.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>while (tokenizer.hasMoreTokens()) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>22.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>word.set(tokenizer.nextToken());</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>23.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>output.collect(word, one);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>24.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>25.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>26.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>27.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>28.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- public static class Reduce extends MapReduceBase implements
|
|
|
- Reducer<Text, IntWritable, Text, IntWritable> {
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>29.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- public void reduce(Text key, Iterator<IntWritable> values,
|
|
|
- OutputCollector<Text, IntWritable> output,
|
|
|
- Reporter reporter) throws IOException {
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>30.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>int sum = 0;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>31.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>while (values.hasNext()) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>32.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>sum += values.next().get();</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>33.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>34.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>output.collect(key, new IntWritable(sum));</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>35.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>36.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>37.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>38.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- public static void main(String[] args) throws Exception {
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>39.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- JobConf conf = new JobConf(WordCount.class);
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>40.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setJobName("wordcount");</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>41.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>42.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setOutputKeyClass(Text.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>43.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setOutputValueClass(IntWritable.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>44.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>45.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setMapperClass(Map.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>46.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setCombinerClass(Reduce.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>47.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setReducerClass(Reduce.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>48.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>49.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setInputFormat(TextInputFormat.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>50.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setOutputFormat(TextOutputFormat.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>51.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>52.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>FileInputFormat.setInputPaths(conf, new Path(args[0]));</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>53.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>FileOutputFormat.setOutputPath(conf, new Path(args[1]));</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>54.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>55.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>JobClient.runJob(conf);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>57.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>58.</td>
|
|
|
- <td>
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>59.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- </table>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Usage</title>
|
|
|
-
|
|
|
- <p>Assuming <code>HADOOP_HOME</code> is the root of the installation and
|
|
|
- <code>HADOOP_VERSION</code> is the Hadoop version installed, compile
|
|
|
- <code>WordCount.java</code> and create a jar:</p>
|
|
|
- <p>
|
|
|
- <code>$ mkdir wordcount_classes</code><br/>
|
|
|
- <code>
|
|
|
- $ javac -classpath ${HADOOP_HOME}/hadoop-${HADOOP_VERSION}-core.jar
|
|
|
- -d wordcount_classes WordCount.java
|
|
|
- </code><br/>
|
|
|
- <code>$ jar -cvf /usr/joe/wordcount.jar -C wordcount_classes/ .</code>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Assuming that:</p>
|
|
|
- <ul>
|
|
|
- <li>
|
|
|
- <code>/usr/joe/wordcount/input</code> - input directory in HDFS
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- <code>/usr/joe/wordcount/output</code> - output directory in HDFS
|
|
|
- </li>
|
|
|
- </ul>
|
|
|
-
|
|
|
- <p>Sample text-files as input:</p>
|
|
|
- <p>
|
|
|
- <code>$ bin/hadoop dfs -ls /usr/joe/wordcount/input/</code><br/>
|
|
|
- <code>/usr/joe/wordcount/input/file01</code><br/>
|
|
|
- <code>/usr/joe/wordcount/input/file02</code><br/>
|
|
|
- <br/>
|
|
|
- <code>$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file01</code><br/>
|
|
|
- <code>Hello World Bye World</code><br/>
|
|
|
- <br/>
|
|
|
- <code>$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file02</code><br/>
|
|
|
- <code>Hello Hadoop Goodbye Hadoop</code>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Run the application:</p>
|
|
|
- <p>
|
|
|
- <code>
|
|
|
- $ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount
|
|
|
- /usr/joe/wordcount/input /usr/joe/wordcount/output
|
|
|
- </code>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Output:</p>
|
|
|
- <p>
|
|
|
- <code>
|
|
|
- $ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000
|
|
|
- </code>
|
|
|
- <br/>
|
|
|
- <code>Bye 1</code><br/>
|
|
|
- <code>Goodbye 1</code><br/>
|
|
|
- <code>Hadoop 2</code><br/>
|
|
|
- <code>Hello 2</code><br/>
|
|
|
- <code>World 2</code><br/>
|
|
|
- </p>
|
|
|
-
|
|
|
-        <p>Applications can specify a comma-separated list of paths which
-        will be present in the current working directory of the task
-        using the option <code>-files</code>. The <code>-libjars</code>
-        option allows applications to add jars to the classpaths of the maps
-        and reduces. The <code>-archives</code> option allows them to pass archives
-        as arguments; these are unzipped/unjarred and a link with the name of the
-        jar/zip is created in the current working directory of tasks. More
-        details about the command line options are available in the
-        <a href="commands_manual.html">Hadoop Command Guide</a>.</p>
|
|
|
-
|
|
|
- <p>Running <code>wordcount</code> example with
|
|
|
- <code>-libjars</code> and <code>-files</code>:<br/>
|
|
|
- <code> hadoop jar hadoop-examples.jar wordcount -files cachefile.txt
|
|
|
- -libjars mylib.jar input output </code>
|
|
|
- </p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Walk-through</title>
|
|
|
-
|
|
|
- <p>The <code>WordCount</code> application is quite straight-forward.</p>
|
|
|
-
|
|
|
- <p>The <code>Mapper</code> implementation (lines 14-26), via the
|
|
|
- <code>map</code> method (lines 18-25), processes one line at a time,
|
|
|
- as provided by the specified <code>TextInputFormat</code> (line 49).
|
|
|
-        It then splits the line into tokens separated by whitespace, via the
|
|
|
- <code>StringTokenizer</code>, and emits a key-value pair of
|
|
|
- <code>< <word>, 1></code>.</p>
|
|
|
-
|
|
|
- <p>
|
|
|
- For the given sample input the first map emits:<br/>
|
|
|
- <code>< Hello, 1></code><br/>
|
|
|
- <code>< World, 1></code><br/>
|
|
|
- <code>< Bye, 1></code><br/>
|
|
|
- <code>< World, 1></code><br/>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>
|
|
|
- The second map emits:<br/>
|
|
|
- <code>< Hello, 1></code><br/>
|
|
|
- <code>< Hadoop, 1></code><br/>
|
|
|
- <code>< Goodbye, 1></code><br/>
|
|
|
- <code>< Hadoop, 1></code><br/>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>We'll learn more about the number of maps spawned for a given job, and
|
|
|
- how to control them in a fine-grained manner, a bit later in the
|
|
|
- tutorial.</p>
|
|
|
-
|
|
|
- <p><code>WordCount</code> also specifies a <code>combiner</code> (line
|
|
|
- 46). Hence, the output of each map is passed through the local combiner
|
|
|
-        (which is the same as the <code>Reducer</code>, as per the job
|
|
|
- configuration) for local aggregation, after being sorted on the
|
|
|
- <em>key</em>s.</p>
|
|
|
-
|
|
|
- <p>
|
|
|
- The output of the first map:<br/>
|
|
|
- <code>< Bye, 1></code><br/>
|
|
|
- <code>< Hello, 1></code><br/>
|
|
|
- <code>< World, 2></code><br/>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>
|
|
|
- The output of the second map:<br/>
|
|
|
- <code>< Goodbye, 1></code><br/>
|
|
|
- <code>< Hadoop, 2></code><br/>
|
|
|
- <code>< Hello, 1></code><br/>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>The <code>Reducer</code> implementation (lines 28-36), via the
|
|
|
-        <code>reduce</code> method (lines 29-35), just sums up the values,
-        which are the occurrence counts for each key (i.e. words in this example).
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>
|
|
|
- Thus the output of the job is:<br/>
|
|
|
- <code>< Bye, 1></code><br/>
|
|
|
- <code>< Goodbye, 1></code><br/>
|
|
|
- <code>< Hadoop, 2></code><br/>
|
|
|
- <code>< Hello, 2></code><br/>
|
|
|
- <code>< World, 2></code><br/>
|
|
|
- </p>
|
|
|
-
|
|
|
-        <p>The <code>main</code> method specifies various facets of the job, such
-        as the input/output paths (passed via the command line), key/value
-        types, input/output formats etc., in the <code>JobConf</code>.
-        It then calls <code>JobClient.runJob</code> (line 55) to submit the job
-        and monitor its progress.</p>
|
|
|
-
|
|
|
- <p>We'll learn more about <code>JobConf</code>, <code>JobClient</code>,
|
|
|
- <code>Tool</code> and other interfaces and classes a bit later in the
|
|
|
- tutorial.</p>
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Map/Reduce - User Interfaces</title>
|
|
|
-
|
|
|
- <p>This section provides a reasonable amount of detail on every user-facing
|
|
|
-      aspect of the Map/Reduce framework. This should help users implement,
|
|
|
- configure and tune their jobs in a fine-grained manner. However, please
|
|
|
- note that the javadoc for each class/interface remains the most
|
|
|
- comprehensive documentation available; this is only meant to be a tutorial.
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Let us first take the <code>Mapper</code> and <code>Reducer</code>
|
|
|
- interfaces. Applications typically implement them to provide the
|
|
|
- <code>map</code> and <code>reduce</code> methods.</p>
|
|
|
-
|
|
|
- <p>We will then discuss other core interfaces including
|
|
|
- <code>JobConf</code>, <code>JobClient</code>, <code>Partitioner</code>,
|
|
|
- <code>OutputCollector</code>, <code>Reporter</code>,
|
|
|
- <code>InputFormat</code>, <code>OutputFormat</code>,
|
|
|
- <code>OutputCommitter</code> and others.</p>
|
|
|
-
|
|
|
- <p>Finally, we will wrap up by discussing some useful features of the
|
|
|
- framework such as the <code>DistributedCache</code>,
|
|
|
- <code>IsolationRunner</code> etc.</p>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Payload</title>
|
|
|
-
|
|
|
- <p>Applications typically implement the <code>Mapper</code> and
|
|
|
- <code>Reducer</code> interfaces to provide the <code>map</code> and
|
|
|
- <code>reduce</code> methods. These form the core of the job.</p>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Mapper</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/mapper">
|
|
|
- Mapper</a> maps input key/value pairs to a set of intermediate
|
|
|
- key/value pairs.</p>
|
|
|
-
|
|
|
- <p>Maps are the individual tasks that transform input records into
|
|
|
- intermediate records. The transformed intermediate records do not need
|
|
|
- to be of the same type as the input records. A given input pair may
|
|
|
- map to zero or many output pairs.</p>
|
|
|
-
|
|
|
- <p>The Hadoop Map/Reduce framework spawns one map task for each
|
|
|
- <code>InputSplit</code> generated by the <code>InputFormat</code> for
|
|
|
- the job.</p>
|
|
|
-
|
|
|
- <p>Overall, <code>Mapper</code> implementations are passed the
|
|
|
- <code>JobConf</code> for the job via the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconfigurable/configure">
|
|
|
-          JobConfigurable.configure(JobConf)</a> method and can override it to
|
|
|
- initialize themselves. The framework then calls
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/mapper/map">
|
|
|
- map(WritableComparable, Writable, OutputCollector, Reporter)</a> for
|
|
|
- each key/value pair in the <code>InputSplit</code> for that task.
|
|
|
- Applications can then override the
|
|
|
- <a href="ext:api/org/apache/hadoop/io/closeable/close">
|
|
|
- Closeable.close()</a> method to perform any required cleanup.</p>
|
|
|
-
|
|
|
-
|
|
|
- <p>Output pairs do not need to be of the same types as input pairs. A
|
|
|
- given input pair may map to zero or many output pairs. Output pairs
|
|
|
- are collected with calls to
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/outputcollector/collect">
|
|
|
- OutputCollector.collect(WritableComparable,Writable)</a>.</p>
|
|
|
-
|
|
|
- <p>Applications can use the <code>Reporter</code> to report
|
|
|
- progress, set application-level status messages and update
|
|
|
- <code>Counters</code>, or just indicate that they are alive.</p>
|
|
|
-
|
|
|
- <p>All intermediate values associated with a given output key are
|
|
|
- subsequently grouped by the framework, and passed to the
|
|
|
- <code>Reducer</code>(s) to determine the final output. Users can
|
|
|
- control the grouping by specifying a <code>Comparator</code> via
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setoutputkeycomparatorclass">
|
|
|
- JobConf.setOutputKeyComparatorClass(Class)</a>.</p>
|
|
|
-
|
|
|
- <p>The <code>Mapper</code> outputs are sorted and then
|
|
|
- partitioned per <code>Reducer</code>. The total number of partitions is
|
|
|
- the same as the number of reduce tasks for the job. Users can control
|
|
|
- which keys (and hence records) go to which <code>Reducer</code> by
|
|
|
- implementing a custom <code>Partitioner</code>.</p>
|
|
|
-
|
|
|
- <p>Users can optionally specify a <code>combiner</code>, via
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setcombinerclass">
|
|
|
- JobConf.setCombinerClass(Class)</a>, to perform local aggregation of
|
|
|
- the intermediate outputs, which helps to cut down the amount of data
|
|
|
- transferred from the <code>Mapper</code> to the <code>Reducer</code>.
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>The intermediate, sorted outputs are always stored in a simple
|
|
|
- (key-len, key, value-len, value) format.
|
|
|
- Applications can control if, and how, the
|
|
|
- intermediate outputs are to be compressed and the
|
|
|
- <a href="ext:api/org/apache/hadoop/io/compress/compressioncodec">
|
|
|
- CompressionCodec</a> to be used via the <code>JobConf</code>.
|
|
|
- </p>
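-
-          <p>As a sketch only (these calls are not part of the WordCount example above;
-          <code>MyReducer</code> is a placeholder and the codec choice is purely
-          illustrative), a combiner and intermediate compression might be configured
-          together as follows:</p>
-          <p>
-            <code>conf.setCombinerClass(MyReducer.class);  // local aggregation of map output</code><br/>
-            <code>conf.setCompressMapOutput(true);  // compress intermediate outputs</code><br/>
-            <code>conf.setMapOutputCompressorClass(GzipCodec.class);  // org.apache.hadoop.io.compress.GzipCodec</code><br/>
-          </p>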
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>How Many Maps?</title>
|
|
|
-
|
|
|
- <p>The number of maps is usually driven by the total size of the
|
|
|
- inputs, that is, the total number of blocks of the input files.</p>
|
|
|
-
|
|
|
- <p>The right level of parallelism for maps seems to be around 10-100
|
|
|
- maps per-node, although it has been set up to 300 maps for very
|
|
|
-          cpu-light map tasks. Task setup takes a while, so it is best if the
|
|
|
- maps take at least a minute to execute.</p>
|
|
|
-
|
|
|
- <p>Thus, if you expect 10TB of input data and have a blocksize of
|
|
|
- <code>128MB</code>, you'll end up with 82,000 maps, unless
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setnummaptasks">
|
|
|
- setNumMapTasks(int)</a> (which only provides a hint to the framework)
|
|
|
- is used to set it even higher.</p>
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Reducer</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/reducer">
|
|
|
- Reducer</a> reduces a set of intermediate values which share a key to
|
|
|
- a smaller set of values.</p>
|
|
|
-
|
|
|
- <p>The number of reduces for the job is set by the user
|
|
|
- via <a href="ext:api/org/apache/hadoop/mapred/jobconf/setnumreducetasks">
|
|
|
- JobConf.setNumReduceTasks(int)</a>.</p>
|
|
|
-
|
|
|
- <p>Overall, <code>Reducer</code> implementations are passed the
|
|
|
- <code>JobConf</code> for the job via the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconfigurable/configure">
|
|
|
- JobConfigurable.configure(JobConf)</a> method and can override it to
|
|
|
- initialize themselves. The framework then calls
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/reducer/reduce">
|
|
|
- reduce(WritableComparable, Iterator, OutputCollector, Reporter)</a>
|
|
|
- method for each <code><key, (list of values)></code>
|
|
|
- pair in the grouped inputs. Applications can then override the
|
|
|
- <a href="ext:api/org/apache/hadoop/io/closeable/close">
|
|
|
- Closeable.close()</a> method to perform any required cleanup.</p>
|
|
|
-
|
|
|
- <p><code>Reducer</code> has 3 primary phases: shuffle, sort and reduce.
|
|
|
- </p>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Shuffle</title>
|
|
|
-
|
|
|
- <p>Input to the <code>Reducer</code> is the sorted output of the
|
|
|
- mappers. In this phase the framework fetches the relevant partition
|
|
|
- of the output of all the mappers, via HTTP.</p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Sort</title>
|
|
|
-
|
|
|
- <p>The framework groups <code>Reducer</code> inputs by keys (since
|
|
|
- different mappers may have output the same key) in this stage.</p>
|
|
|
-
|
|
|
- <p>The shuffle and sort phases occur simultaneously; while
|
|
|
- map-outputs are being fetched they are merged.</p>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Secondary Sort</title>
|
|
|
-
|
|
|
- <p>If equivalence rules for grouping the intermediate keys are
|
|
|
- required to be different from those for grouping keys before
|
|
|
- reduction, then one may specify a <code>Comparator</code> via
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setoutputvaluegroupingcomparator">
|
|
|
- JobConf.setOutputValueGroupingComparator(Class)</a>. Since
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setoutputkeycomparatorclass">
|
|
|
- JobConf.setOutputKeyComparatorClass(Class)</a> can be used to
|
|
|
- control how intermediate keys are grouped, these can be used in
|
|
|
- conjunction to simulate <em>secondary sort on values</em>.</p>
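-
-            <p>As an illustrative sketch (the comparator class names below are
-            hypothetical placeholders that an application would have to provide):</p>
-            <p>
-              <code>// sort by the full composite key, group reduce input by its primary part</code><br/>
-              <code>conf.setOutputKeyComparatorClass(FullKeyComparator.class);</code><br/>
-              <code>conf.setOutputValueGroupingComparator(PrimaryKeyGroupingComparator.class);</code><br/>
-            </p>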
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Reduce</title>
|
|
|
-
|
|
|
- <p>In this phase the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/reducer/reduce">
|
|
|
- reduce(WritableComparable, Iterator, OutputCollector, Reporter)</a>
|
|
|
- method is called for each <code><key, (list of values)></code>
|
|
|
- pair in the grouped inputs.</p>
|
|
|
-
|
|
|
- <p>The output of the reduce task is typically written to the
|
|
|
- <a href="ext:api/org/apache/hadoop/fs/filesystem">
|
|
|
- FileSystem</a> via
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/outputcollector/collect">
|
|
|
- OutputCollector.collect(WritableComparable, Writable)</a>.</p>
|
|
|
-
|
|
|
- <p>Applications can use the <code>Reporter</code> to report
|
|
|
- progress, set application-level status messages and update
|
|
|
- <code>Counters</code>, or just indicate that they are alive.</p>
|
|
|
-
|
|
|
- <p>The output of the <code>Reducer</code> is <em>not sorted</em>.</p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>How Many Reduces?</title>
|
|
|
-
|
|
|
- <p>The right number of reduces seems to be <code>0.95</code> or
|
|
|
- <code>1.75</code> multiplied by (<<em>no. of nodes</em>> *
|
|
|
- <code>mapred.tasktracker.reduce.tasks.maximum</code>).</p>
|
|
|
-
|
|
|
- <p>With <code>0.95</code> all of the reduces can launch immediately
|
|
|
-          and start transferring map outputs as the maps finish. With
|
|
|
- <code>1.75</code> the faster nodes will finish their first round of
|
|
|
- reduces and launch a second wave of reduces doing a much better job
|
|
|
- of load balancing.</p>
|
|
|
-
|
|
|
- <p>Increasing the number of reduces increases the framework overhead,
|
|
|
- but increases load balancing and lowers the cost of failures.</p>
|
|
|
-
|
|
|
- <p>The scaling factors above are slightly less than whole numbers to
|
|
|
- reserve a few reduce slots in the framework for speculative-tasks and
|
|
|
- failed tasks.</p>
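-
-          <p>For example, a job might pick its reduce count along these lines (the node
-          count and per-node slot count below are assumptions, not defaults):</p>
-          <p>
-            <code>int nodes = 100;  // assumed cluster size</code><br/>
-            <code>int reduceSlotsPerNode = 2;  // assumed mapred.tasktracker.reduce.tasks.maximum</code><br/>
-            <code>conf.setNumReduceTasks((int) (0.95 * nodes * reduceSlotsPerNode));</code><br/>
-          </p>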
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Reducer NONE</title>
|
|
|
-
|
|
|
- <p>It is legal to set the number of reduce-tasks to <em>zero</em> if
|
|
|
- no reduction is desired.</p>
|
|
|
-
|
|
|
- <p>In this case the outputs of the map-tasks go directly to the
|
|
|
- <code>FileSystem</code>, into the output path set by
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/fileoutputformat/setoutputpath">
|
|
|
- setOutputPath(Path)</a>. The framework does not sort the
|
|
|
- map-outputs before writing them out to the <code>FileSystem</code>.
|
|
|
- </p>
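-
-          <p>A minimal sketch of a map-only configuration (the output path is
-          illustrative):</p>
-          <p>
-            <code>conf.setNumReduceTasks(0);  // no reduces: map output is the final output</code><br/>
-            <code>FileOutputFormat.setOutputPath(conf, new Path("/user/joe/map-only-output"));</code><br/>
-          </p>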
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Partitioner</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/partitioner">
|
|
|
- Partitioner</a> partitions the key space.</p>
|
|
|
-
|
|
|
- <p>Partitioner controls the partitioning of the keys of the
|
|
|
- intermediate map-outputs. The key (or a subset of the key) is used to
|
|
|
- derive the partition, typically by a <em>hash function</em>. The total
|
|
|
- number of partitions is the same as the number of reduce tasks for the
|
|
|
- job. Hence this controls which of the <code>m</code> reduce tasks the
|
|
|
- intermediate key (and hence the record) is sent to for reduction.</p>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/lib/hashpartitioner">
|
|
|
- HashPartitioner</a> is the default <code>Partitioner</code>.</p>
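-
-        <p>As an illustration only (the class and its partitioning rule are made up for
-        this sketch), a custom <code>Partitioner</code> could look like:</p>
-        <p>
-          <code>public class FirstCharPartitioner implements Partitioner<Text, IntWritable> {</code><br/>
-          <code>  public void configure(JobConf job) { }</code><br/>
-          <code>  public int getPartition(Text key, IntWritable value, int numPartitions) {</code><br/>
-          <code>    // keys starting with the same character go to the same reduce</code><br/>
-          <code>    int c = (key.getLength() == 0) ? 0 : key.charAt(0);</code><br/>
-          <code>    return c % numPartitions;</code><br/>
-          <code>  }</code><br/>
-          <code>}</code><br/>
-        </p>
-        <p>It would then be registered via
-        <code>conf.setPartitionerClass(FirstCharPartitioner.class)</code>.</p>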
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Reporter</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/reporter">
|
|
|
- Reporter</a> is a facility for Map/Reduce applications to report
|
|
|
- progress, set application-level status messages and update
|
|
|
- <code>Counters</code>.</p>
|
|
|
-
|
|
|
- <p><code>Mapper</code> and <code>Reducer</code> implementations can use
|
|
|
- the <code>Reporter</code> to report progress or just indicate
|
|
|
- that they are alive. In scenarios where the application takes a
|
|
|
- significant amount of time to process individual key/value pairs,
|
|
|
- this is crucial since the framework might assume that the task has
|
|
|
- timed-out and kill that task. Another way to avoid this is to
|
|
|
- set the configuration parameter <code>mapred.task.timeout</code> to a
|
|
|
- high-enough value (or even set it to <em>zero</em> for no time-outs).
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Applications can also update <code>Counters</code> using the
|
|
|
- <code>Reporter</code>.</p>
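-
-        <p>Inside a <code>map</code> or <code>reduce</code> method the
-        <code>Reporter</code> might be used as follows (the counter enum is a
-        hypothetical, application-defined one):</p>
-        <p>
-          <code>// enum MyCounters { MALFORMED_RECORDS }  // defined by the application</code><br/>
-          <code>reporter.setStatus("processing " + key);  // application-level status message</code><br/>
-          <code>reporter.incrCounter(MyCounters.MALFORMED_RECORDS, 1);  // update a custom counter</code><br/>
-          <code>reporter.progress();  // simply indicate liveness</code><br/>
-        </p>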
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>OutputCollector</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/outputcollector">
|
|
|
- OutputCollector</a> is a generalization of the facility provided by
|
|
|
- the Map/Reduce framework to collect data output by the
|
|
|
- <code>Mapper</code> or the <code>Reducer</code> (either the
|
|
|
- intermediate outputs or the output of the job).</p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <p>Hadoop Map/Reduce comes bundled with a
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/lib/package-summary">
|
|
|
- library</a> of generally useful mappers, reducers, and partitioners.</p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Job Configuration</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/jobconf">
|
|
|
- JobConf</a> represents a Map/Reduce job configuration.</p>
|
|
|
-
|
|
|
- <p><code>JobConf</code> is the primary interface for a user to describe
|
|
|
- a Map/Reduce job to the Hadoop framework for execution. The framework
|
|
|
- tries to faithfully execute the job as described by <code>JobConf</code>,
|
|
|
- however:</p>
|
|
|
- <ul>
|
|
|
-        <li>
|
|
|
- Some configuration parameters may have been marked as
|
|
|
- <a href="ext:api/org/apache/hadoop/conf/configuration/final_parameters">
|
|
|
- final</a> by administrators and hence cannot be altered.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- While some job parameters are straight-forward to set (e.g.
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setnumreducetasks">
|
|
|
- setNumReduceTasks(int)</a>), other parameters interact subtly with
|
|
|
- the rest of the framework and/or job configuration and are
|
|
|
- more complex to set (e.g.
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setnummaptasks">
|
|
|
- setNumMapTasks(int)</a>).
|
|
|
- </li>
|
|
|
- </ul>
|
|
|
-
|
|
|
- <p><code>JobConf</code> is typically used to specify the
|
|
|
- <code>Mapper</code>, combiner (if any), <code>Partitioner</code>,
|
|
|
- <code>Reducer</code>, <code>InputFormat</code>,
|
|
|
- <code>OutputFormat</code> and <code>OutputCommitter</code>
|
|
|
- implementations. <code>JobConf</code> also
|
|
|
- indicates the set of input files
|
|
|
- (<a href="ext:api/org/apache/hadoop/mapred/fileinputformat/setinputpaths">setInputPaths(JobConf, Path...)</a>
|
|
|
- /<a href="ext:api/org/apache/hadoop/mapred/fileinputformat/addinputpath">addInputPath(JobConf, Path)</a>)
|
|
|
- and (<a href="ext:api/org/apache/hadoop/mapred/fileinputformat/setinputpathstring">setInputPaths(JobConf, String)</a>
|
|
|
- /<a href="ext:api/org/apache/hadoop/mapred/fileinputformat/addinputpathstring">addInputPaths(JobConf, String)</a>)
|
|
|
- and where the output files should be written
|
|
|
- (<a href="ext:api/org/apache/hadoop/mapred/fileoutputformat/setoutputpath">setOutputPath(Path)</a>).</p>
|
|
|
-
|
|
|
- <p>Optionally, <code>JobConf</code> is used to specify other advanced
|
|
|
- facets of the job such as the <code>Comparator</code> to be used, files
|
|
|
- to be put in the <code>DistributedCache</code>, whether intermediate
|
|
|
- and/or job outputs are to be compressed (and how), debugging via
|
|
|
- user-provided scripts
|
|
|
- (<a href="ext:api/org/apache/hadoop/mapred/jobconf/setmapdebugscript">setMapDebugScript(String)</a>/<a href="ext:api/org/apache/hadoop/mapred/jobconf/setreducedebugscript">setReduceDebugScript(String)</a>)
|
|
|
- , whether job tasks can be executed in a <em>speculative</em> manner
|
|
|
- (<a href="ext:api/org/apache/hadoop/mapred/jobconf/setmapspeculativeexecution">setMapSpeculativeExecution(boolean)</a>)/(<a href="ext:api/org/apache/hadoop/mapred/jobconf/setreducespeculativeexecution">setReduceSpeculativeExecution(boolean)</a>)
|
|
|
- , maximum number of attempts per task
|
|
|
- (<a href="ext:api/org/apache/hadoop/mapred/jobconf/setmaxmapattempts">setMaxMapAttempts(int)</a>/<a href="ext:api/org/apache/hadoop/mapred/jobconf/setmaxreduceattempts">setMaxReduceAttempts(int)</a>)
|
|
|
-      , the percentage of task failures that can be tolerated by the job
|
|
|
- (<a href="ext:api/org/apache/hadoop/mapred/jobconf/setmaxmaptaskfailurespercent">setMaxMapTaskFailuresPercent(int)</a>/<a href="ext:api/org/apache/hadoop/mapred/jobconf/setmaxreducetaskfailurespercent">setMaxReduceTaskFailuresPercent(int)</a>)
|
|
|
- etc.</p>
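-
-      <p>As an illustrative sketch only, a few of these knobs might be set together
-      like this (the values are examples, not recommendations):</p>
-      <p>
-        <code>conf.setMapSpeculativeExecution(false);</code><br/>
-        <code>conf.setReduceSpeculativeExecution(false);</code><br/>
-        <code>conf.setMaxMapAttempts(8);  // the default is 4</code><br/>
-        <code>conf.setMaxReduceAttempts(8);</code><br/>
-        <code>conf.setMaxMapTaskFailuresPercent(5);  // tolerate up to 5% failed map tasks</code><br/>
-      </p>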
|
|
|
-
|
|
|
- <p>Of course, users can use
|
|
|
- <a href="ext:api/org/apache/hadoop/conf/configuration/set">set(String, String)</a>/<a href="ext:api/org/apache/hadoop/conf/configuration/get">get(String, String)</a>
|
|
|
- to set/get arbitrary parameters needed by applications. However, use the
|
|
|
- <code>DistributedCache</code> for large amounts of (read-only) data.</p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Task Execution & Environment</title>
|
|
|
-
|
|
|
- <p>The <code>TaskTracker</code> executes the <code>Mapper</code>/
|
|
|
- <code>Reducer</code> <em>task</em> as a child process in a separate jvm.
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>The child-task inherits the environment of the parent
|
|
|
- <code>TaskTracker</code>. The user can specify additional options to the
|
|
|
- child-jvm via the <code>mapred.child.java.opts</code> configuration
|
|
|
- parameter in the <code>JobConf</code> such as non-standard paths for the
|
|
|
- run-time linker to search shared libraries via
|
|
|
- <code>-Djava.library.path=<></code> etc. If the
|
|
|
- <code>mapred.child.java.opts</code> contains the symbol <em>@taskid@</em>
|
|
|
-        it is interpolated with the value of the <code>taskid</code> of the map/reduce
|
|
|
- task.</p>
|
|
|
-
|
|
|
- <p>Here is an example with multiple arguments and substitutions,
|
|
|
-        showing JVM GC logging, and the start of a passwordless JVM JMX agent so that
-        it can connect with jconsole and the like to watch child memory,
|
|
|
- threads and get thread dumps. It also sets the maximum heap-size of the
|
|
|
- child jvm to 512MB and adds an additional path to the
|
|
|
- <code>java.library.path</code> of the child-jvm.</p>
|
|
|
-
|
|
|
- <p>
|
|
|
- <code><property></code><br/>
|
|
|
- <code><name>mapred.child.java.opts</name></code><br/>
|
|
|
- <code><value></code><br/>
|
|
|
- <code>
|
|
|
- -Xmx512M -Djava.library.path=/home/mycompany/lib
|
|
|
- -verbose:gc -Xloggc:/tmp/@taskid@.gc</code><br/>
|
|
|
- <code>
|
|
|
- -Dcom.sun.management.jmxremote.authenticate=false
|
|
|
- -Dcom.sun.management.jmxremote.ssl=false</code><br/>
|
|
|
- <code></value></code><br/>
|
|
|
- <code></property></code>
|
|
|
- </p>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title> Memory management</title>
|
|
|
- <p>Users/admins can also specify the maximum virtual memory
|
|
|
- of the launched child-task, and any sub-process it launches
|
|
|
- recursively, using <code>mapred.child.ulimit</code>. Note that
|
|
|
- the value set here is a per process limit.
|
|
|
- The value for <code>mapred.child.ulimit</code> should be specified
|
|
|
-          in kilobytes (KB). The value must also be greater than
-          or equal to the -Xmx passed to the JVM, or else the VM might not start.
|
|
|
- </p>
|
|
|
-
|
|
|
-          <p>Note: <code>mapred.child.java.opts</code> is used only for
-          configuring the child tasks launched by the task tracker. Configuring
|
|
|
- the memory options for daemons is documented in
|
|
|
- <a href="cluster_setup.html#Configuring+the+Environment+of+the+Hadoop+Daemons">
|
|
|
- cluster_setup.html </a></p>
|
|
|
-
|
|
|
- <p>The memory available to some parts of the framework is also
|
|
|
- configurable. In map and reduce tasks, performance may be influenced
|
|
|
- by adjusting parameters influencing the concurrency of operations and
|
|
|
- the frequency with which data will hit disk. Monitoring the filesystem
|
|
|
-          counters for a job, particularly relative to byte counts from the map
-          and into the reduce, is invaluable to the tuning of these
|
|
|
- parameters.</p>
|
|
|
-
|
|
|
- <p>Users can choose to override default limits of Virtual Memory and RAM
|
|
|
- enforced by the task tracker, if memory management is enabled.
|
|
|
-          Users can set the following parameters per job:</p>
|
|
|
-
|
|
|
- <table>
|
|
|
- <tr><th>Name</th><th>Type</th><th>Description</th></tr>
|
|
|
- <tr><td><code>mapred.task.maxvmem</code></td><td>int</td>
|
|
|
- <td>A number, in bytes, that represents the maximum Virtual Memory
|
|
|
- task-limit for each task of the job. A task will be killed if
|
|
|
- it consumes more Virtual Memory than this number.
|
|
|
- </td></tr>
|
|
|
- <tr><td>mapred.task.maxpmem</td><td>int</td>
|
|
|
- <td>A number, in bytes, that represents the maximum RAM task-limit
|
|
|
- for each task of the job. This number can be optionally used by
|
|
|
- Schedulers to prevent over-scheduling of tasks on a node based
|
|
|
- on RAM needs.
|
|
|
- </td></tr>
|
|
|
- </table>
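-
-          <p>For example, a job that needs roughly 2GB of virtual memory per task might
-          request (assuming memory management is enabled on the cluster; the values are
-          illustrative):</p>
-          <p>
-            <code>conf.setLong("mapred.task.maxvmem", 2L * 1024 * 1024 * 1024);  // 2GB virtual memory per task</code><br/>
-            <code>conf.setLong("mapred.task.maxpmem", 1L * 1024 * 1024 * 1024);  // 1GB RAM hint for schedulers</code><br/>
-          </p>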
|
|
|
- </section>
|
|
|
- <section>
|
|
|
- <title>Map Parameters</title>
|
|
|
-
|
|
|
- <p>A record emitted from a map will be serialized into a buffer and
|
|
|
- metadata will be stored into accounting buffers. As described in the
|
|
|
- following options, when either the serialization buffer or the
|
|
|
- metadata exceed a threshold, the contents of the buffers will be
|
|
|
- sorted and written to disk in the background while the map continues
|
|
|
- to output records. If either buffer fills completely while the spill
|
|
|
- is in progress, the map thread will block. When the map is finished,
|
|
|
- any remaining records are written to disk and all on-disk segments
|
|
|
- are merged into a single file. Minimizing the number of spills to
|
|
|
- disk can decrease map time, but a larger buffer also decreases the
|
|
|
- memory available to the mapper.</p>
|
|
|
-
|
|
|
- <table>
|
|
|
- <tr><th>Name</th><th>Type</th><th>Description</th></tr>
|
|
|
- <tr><td>io.sort.mb</td><td>int</td>
|
|
|
- <td>The cumulative size of the serialization and accounting
|
|
|
- buffers storing records emitted from the map, in megabytes.
|
|
|
- </td></tr>
|
|
|
- <tr><td>io.sort.record.percent</td><td>float</td>
|
|
|
- <td>The ratio of serialization to accounting space can be
|
|
|
- adjusted. Each serialized record requires 16 bytes of
|
|
|
- accounting information in addition to its serialized size to
|
|
|
- effect the sort. This percentage of space allocated from
|
|
|
- <code>io.sort.mb</code> affects the probability of a spill to
|
|
|
- disk being caused by either exhaustion of the serialization
|
|
|
- buffer or the accounting space. Clearly, for a map outputting
|
|
|
- small records, a higher value than the default will likely
|
|
|
- decrease the number of spills to disk.</td></tr>
|
|
|
- <tr><td>io.sort.spill.percent</td><td>float</td>
|
|
|
- <td>This is the threshold for the accounting and serialization
|
|
|
- buffers. When this percentage of either buffer has filled,
|
|
|
- their contents will be spilled to disk in the background. Let
|
|
|
- <code>io.sort.record.percent</code> be <em>r</em>,
|
|
|
- <code>io.sort.mb</code> be <em>x</em>, and this value be
|
|
|
- <em>q</em>. The maximum number of records collected before the
|
|
|
- collection thread will spill is <code>r * x * q * 2^16</code>.
|
|
|
- Note that a higher value may decrease the number of- or even
|
|
|
- eliminate- merges, but will also increase the probability of
|
|
|
- the map task getting blocked. The lowest average map times are
|
|
|
- usually obtained by accurately estimating the size of the map
|
|
|
- output and preventing multiple spills.</td></tr>
|
|
|
- </table>
|
|
|
-
|
|
|
- <p>Other notes</p>
|
|
|
- <ul>
|
|
|
- <li>If either spill threshold is exceeded while a spill is in
|
|
|
- progress, collection will continue until the spill is finished.
|
|
|
-            For example, if <code>io.sort.spill.percent</code> is set
|
|
|
- to 0.33, and the remainder of the buffer is filled while the spill
|
|
|
- runs, the next spill will include all the collected records, or
|
|
|
- 0.66 of the buffer, and will not generate additional spills. In
|
|
|
- other words, the thresholds are defining triggers, not
|
|
|
- blocking.</li>
|
|
|
- <li>A record larger than the serialization buffer will first
|
|
|
- trigger a spill, then be spilled to a separate file. It is
|
|
|
- undefined whether or not this record will first pass through the
|
|
|
- combiner.</li>
|
|
|
- </ul>
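-
-          <p>Purely as a sketch, a map that emits few but large records might be tuned
-          along these lines (the values are illustrative, not recommendations):</p>
-          <p>
-            <code>conf.setInt("io.sort.mb", 200);  // larger serialization/accounting buffer</code><br/>
-            <code>conf.setFloat("io.sort.record.percent", 0.05f);  // fewer records need less accounting space</code><br/>
-            <code>conf.setFloat("io.sort.spill.percent", 0.90f);  // start spilling at 90% full</code><br/>
-          </p>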
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Shuffle/Reduce Parameters</title>
|
|
|
-
|
|
|
- <p>As described previously, each reduce fetches the output assigned
|
|
|
- to it by the Partitioner via HTTP into memory and periodically
|
|
|
- merges these outputs to disk. If intermediate compression of map
|
|
|
- outputs is turned on, each output is decompressed into memory. The
|
|
|
- following options affect the frequency of these merges to disk prior
|
|
|
- to the reduce and the memory allocated to map output during the
|
|
|
- reduce.</p>
|
|
|
-
|
|
|
- <table>
|
|
|
- <tr><th>Name</th><th>Type</th><th>Description</th></tr>
|
|
|
- <tr><td>io.sort.factor</td><td>int</td>
|
|
|
- <td>Specifies the number of segments on disk to be merged at
|
|
|
- the same time. It limits the number of open files and
|
|
|
- compression codecs during the merge. If the number of files
|
|
|
- exceeds this limit, the merge will proceed in several passes.
|
|
|
- Though this limit also applies to the map, most jobs should be
|
|
|
-            configured so that hitting this limit is
-            unlikely.</td></tr>
|
|
|
- <tr><td>mapred.inmem.merge.threshold</td><td>int</td>
|
|
|
- <td>The number of sorted map outputs fetched into memory
|
|
|
- before being merged to disk. Like the spill thresholds in the
|
|
|
- preceding note, this is not defining a unit of partition, but
|
|
|
- a trigger. In practice, this is usually set very high (1000)
|
|
|
- or disabled (0), since merging in-memory segments is often
|
|
|
- less expensive than merging from disk (see notes following
|
|
|
- this table). This threshold influences only the frequency of
|
|
|
- in-memory merges during the shuffle.</td></tr>
|
|
|
- <tr><td>mapred.job.shuffle.merge.percent</td><td>float</td>
|
|
|
- <td>The memory threshold for fetched map outputs before an
|
|
|
- in-memory merge is started, expressed as a percentage of
|
|
|
- memory allocated to storing map outputs in memory. Since map
|
|
|
- outputs that can't fit in memory can be stalled, setting this
|
|
|
- high may decrease parallelism between the fetch and merge.
|
|
|
- Conversely, values as high as 1.0 have been effective for
|
|
|
- reduces whose input can fit entirely in memory. This parameter
|
|
|
- influences only the frequency of in-memory merges during the
|
|
|
- shuffle.</td></tr>
|
|
|
- <tr><td>mapred.job.shuffle.input.buffer.percent</td><td>float</td>
|
|
|
- <td>The percentage of memory- relative to the maximum heapsize
|
|
|
- as typically specified in <code>mapred.child.java.opts</code>-
|
|
|
- that can be allocated to storing map outputs during the
|
|
|
- shuffle. Though some memory should be set aside for the
|
|
|
- framework, in general it is advantageous to set this high
|
|
|
- enough to store large and numerous map outputs.</td></tr>
|
|
|
- <tr><td>mapred.job.reduce.input.buffer.percent</td><td>float</td>
|
|
|
- <td>The percentage of memory relative to the maximum heapsize
|
|
|
- in which map outputs may be retained during the reduce. When
|
|
|
- the reduce begins, map outputs will be merged to disk until
|
|
|
- those that remain are under the resource limit this defines.
|
|
|
- By default, all map outputs are merged to disk before the
|
|
|
- reduce begins to maximize the memory available to the reduce.
|
|
|
- For less memory-intensive reduces, this should be increased to
|
|
|
- avoid trips to disk.</td></tr>
|
|
|
- </table>
|
|
|
-
|
|
|
- <p>Other notes</p>
|
|
|
- <ul>
|
|
|
- <li>If a map output is larger than 25 percent of the memory
|
|
|
- allocated to copying map outputs, it will be written directly to
|
|
|
- disk without first staging through memory.</li>
|
|
|
- <li>When running with a combiner, the reasoning about high merge
|
|
|
- thresholds and large buffers may not hold. For merges started
|
|
|
- before all map outputs have been fetched, the combiner is run
|
|
|
- while spilling to disk. In some cases, one can obtain better
|
|
|
- reduce times by spending resources combining map outputs- making
|
|
|
- disk spills small and parallelizing spilling and fetching- rather
|
|
|
- than aggressively increasing buffer sizes.</li>
|
|
|
- <li>When merging in-memory map outputs to disk to begin the
|
|
|
- reduce, if an intermediate merge is necessary because there are
|
|
|
- segments to spill and at least <code>io.sort.factor</code>
|
|
|
- segments already on disk, the in-memory map outputs will be part
|
|
|
- of the intermediate merge.</li>
|
|
|
- </ul>
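-
-          <p>Again as a sketch only, a reduce whose input largely fits in memory might
-          be tuned as follows (illustrative values):</p>
-          <p>
-            <code>conf.setInt("io.sort.factor", 100);  // merge more segments per pass</code><br/>
-            <code>conf.setFloat("mapred.job.shuffle.input.buffer.percent", 0.70f);  // heap fraction for fetched map outputs</code><br/>
-            <code>conf.setFloat("mapred.job.reduce.input.buffer.percent", 0.50f);  // retain some outputs in memory during the reduce</code><br/>
-          </p>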
|
|
|
-
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title> Directory Structure </title>
|
|
|
-        <p>The task tracker has a local directory,
-        <code>${mapred.local.dir}/taskTracker/</code>, to create the localized
-        cache and localized job. It can define multiple local directories
-        (spanning multiple disks) and then each filename is assigned to a
-        semi-random local directory. When the job starts, the task tracker
-        creates a localized job directory relative to the local directory
-        specified in the configuration. Thus the task tracker directory
-        structure looks like the following: </p>
|
|
|
- <ul>
|
|
|
- <li><code>${mapred.local.dir}/taskTracker/archive/</code> :
|
|
|
- The distributed cache. This directory holds the localized distributed
|
|
|
-          cache. Thus the localized distributed cache is shared among all
|
|
|
- the tasks and jobs </li>
|
|
|
- <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/</code> :
|
|
|
- The localized job directory
|
|
|
- <ul>
|
|
|
- <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/work/</code>
|
|
|
- : The job-specific shared directory. The tasks can use this space as
|
|
|
- scratch space and share files among them. This directory is exposed
|
|
|
- to the users through the configuration property
|
|
|
-          <code>job.local.dir</code>. The directory can be accessed through the
-          API <a href="ext:api/org/apache/hadoop/mapred/jobconf/getjoblocaldir">
-          JobConf.getJobLocalDir()</a>. It is also available as a system property.
|
|
|
- So, users (streaming etc.) can call
|
|
|
- <code>System.getProperty("job.local.dir")</code> to access the
|
|
|
- directory.</li>
|
|
|
- <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/jars/</code>
|
|
|
- : The jars directory, which has the job jar file and expanded jar.
|
|
|
- The <code>job.jar</code> is the application's jar file that is
|
|
|
- automatically distributed to each machine. It is expanded in jars
|
|
|
- directory before the tasks for the job start. The job.jar location
|
|
|
- is accessible to the application through the api
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/getjar">
|
|
|
- JobConf.getJar() </a>. To access the unjarred directory,
|
|
|
- JobConf.getJar().getParent() can be called.</li>
|
|
|
- <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/job.xml</code>
|
|
|
- : The job.xml file, the generic job configuration, localized for
|
|
|
- the job. </li>
|
|
|
- <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/$taskid</code>
|
|
|
- : The task directory for each task attempt. Each task directory
|
|
|
- again has the following structure :
|
|
|
- <ul>
|
|
|
- <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/$taskid/job.xml</code>
|
|
|
-            : A job.xml file, the task-localized job configuration. Task localization
|
|
|
- means that properties have been set that are specific to
|
|
|
- this particular task within the job. The properties localized for
|
|
|
- each task are described below.</li>
|
|
|
- <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/$taskid/output</code>
|
|
|
- : A directory for intermediate output files. This contains the
|
|
|
- temporary map reduce data generated by the framework
|
|
|
- such as map output files etc. </li>
|
|
|
- <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/$taskid/work</code>
|
|
|
-            : The current working directory of the task.
-            With <a href="#Task+JVM+Reuse">jvm reuse</a> enabled for tasks, this
-            directory will be the directory in which the JVM was started.</li>
|
|
|
- <li><code>${mapred.local.dir}/taskTracker/jobcache/$jobid/$taskid/work/tmp</code>
|
|
|
- : The temporary directory for the task.
|
|
|
- (User can specify the property <code>mapred.child.tmp</code> to set
|
|
|
- the value of temporary directory for map and reduce tasks. This
|
|
|
- defaults to <code>./tmp</code>. If the value is not an absolute path,
|
|
|
- it is prepended with task's working directory. Otherwise, it is
|
|
|
- directly assigned. The directory will be created if it doesn't exist.
|
|
|
- Then, the child java tasks are executed with option
|
|
|
- <code>-Djava.io.tmpdir='the absolute path of the tmp dir'</code>.
|
|
|
-            And pipes and streaming are set with the environment variable,
|
|
|
- <code>TMPDIR='the absolute path of the tmp dir'</code>). This
|
|
|
- directory is created, if <code>mapred.child.tmp</code> has the value
|
|
|
- <code>./tmp</code> </li>
|
|
|
- </ul>
|
|
|
- </li>
|
|
|
- </ul>
|
|
|
- </li>
|
|
|
- </ul>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Task JVM Reuse</title>
|
|
|
- <p>Jobs can enable task JVMs to be reused by specifying the job
|
|
|
- configuration <code>mapred.job.reuse.jvm.num.tasks</code>. If the
|
|
|
- value is 1 (the default), then JVMs are not reused
|
|
|
- (i.e. 1 task per JVM). If it is -1, there is no limit to the number
|
|
|
- of tasks a JVM can run (of the same job). One can also specify some
|
|
|
- value greater than 1 using the api
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setnumtaskstoexecuteperjvm">
|
|
|
- JobConf.setNumTasksToExecutePerJvm(int)</a></p>
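-
-          <p>For example, a job with many short-lived tasks might enable unlimited
-          reuse:</p>
-          <p>
-            <code>conf.setNumTasksToExecutePerJvm(-1);  // reuse the JVM for any number of this job's tasks</code><br/>
-          </p>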
|
|
|
- </section>
|
|
|
-
|
|
|
- <p>The following properties are localized in the job configuration
|
|
|
- for each task's execution: </p>
|
|
|
- <table>
|
|
|
- <tr><th>Name</th><th>Type</th><th>Description</th></tr>
|
|
|
- <tr><td>mapred.job.id</td><td>String</td><td>The job id</td></tr>
|
|
|
- <tr><td>mapred.jar</td><td>String</td>
|
|
|
- <td>job.jar location in job directory</td></tr>
|
|
|
- <tr><td>job.local.dir</td><td> String</td>
|
|
|
- <td> The job specific shared scratch space</td></tr>
|
|
|
- <tr><td>mapred.tip.id</td><td> String</td>
|
|
|
- <td> The task id</td></tr>
|
|
|
- <tr><td>mapred.task.id</td><td> String</td>
|
|
|
- <td> The task attempt id</td></tr>
|
|
|
- <tr><td>mapred.task.is.map</td><td> boolean </td>
|
|
|
- <td>Is this a map task</td></tr>
|
|
|
- <tr><td>mapred.task.partition</td><td> int </td>
|
|
|
- <td>The id of the task within the job</td></tr>
|
|
|
- <tr><td>map.input.file</td><td> String</td>
|
|
|
- <td> The filename that the map is reading from</td></tr>
|
|
|
- <tr><td>map.input.start</td><td> long</td>
|
|
|
- <td> The offset of the start of the map input split</td></tr>
|
|
|
- <tr><td>map.input.length </td><td>long </td>
|
|
|
- <td>The number of bytes in the map input split</td></tr>
|
|
|
- <tr><td>mapred.work.output.dir</td><td> String </td>
|
|
|
- <td>The task's temporary output directory</td></tr>
|
|
|
- </table>
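- <p>For example (a minimal sketch, not part of the tutorial code; the variable
- names are illustrative), a <code>Mapper</code> can read these localized
- properties in its <code>configure</code> method:</p>
- <p>
-   <code>public void configure(JobConf job) {</code><br/>
-   <code>  String taskAttemptId = job.get("mapred.task.id"); // task attempt id</code><br/>
-   <code>  boolean isMap = job.getBoolean("mapred.task.is.map", true);</code><br/>
-   <code>  String inputFile = job.get("map.input.file"); // file the map is reading from</code><br/>
-   <code>}</code>
- </p>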
|
|
|
-
|
|
|
- <p>The standard output (stdout) and error (stderr) streams of the task
|
|
|
- are read by the TaskTracker and logged to
|
|
|
- <code>${HADOOP_LOG_DIR}/userlogs</code></p>
|
|
|
-
|
|
|
- <p>The <a href="#DistributedCache">DistributedCache</a> can also be used
|
|
|
- to distribute both jars and native libraries for use in the map
|
|
|
- and/or reduce tasks. The child-jvm always has its
|
|
|
- <em>current working directory</em> added to the
|
|
|
- <code>java.library.path</code> and <code>LD_LIBRARY_PATH</code>.
|
|
|
- And hence the cached libraries can be loaded via
|
|
|
- <a href="http://java.sun.com/javase/6/docs/api/java/lang/System.html#loadLibrary(java.lang.String)">
|
|
|
- System.loadLibrary</a> or
|
|
|
- <a href="http://java.sun.com/javase/6/docs/api/java/lang/System.html#load(java.lang.String)">
|
|
|
- System.load</a>. More details on how to load shared libraries through
|
|
|
- distributed cache are documented at
|
|
|
- <a href="native_libraries.html#Loading+native+libraries+through+DistributedCache">
|
|
|
- native_libraries.html</a></p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Job Submission and Monitoring</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/jobclient">
|
|
|
- JobClient</a> is the primary interface by which user-job interacts
|
|
|
- with the <code>JobTracker</code>.</p>
|
|
|
-
|
|
|
- <p><code>JobClient</code> provides facilities to submit jobs, track their
|
|
|
- progress, access component-tasks' reports and logs, get the Map/Reduce
|
|
|
- cluster's status information and so on.</p>
|
|
|
-
|
|
|
- <p>The job submission process involves:</p>
|
|
|
- <ol>
|
|
|
- <li>Checking the input and output specifications of the job.</li>
|
|
|
- <li>Computing the <code>InputSplit</code> values for the job.</li>
|
|
|
- <li>
|
|
|
- Setting up the requisite accounting information for the
|
|
|
- <code>DistributedCache</code> of the job, if necessary.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Copying the job's jar and configuration to the Map/Reduce system
|
|
|
- directory on the <code>FileSystem</code>.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Submitting the job to the <code>JobTracker</code> and optionally
|
|
|
- monitoring its status.
|
|
|
- </li>
|
|
|
- </ol>
|
|
|
- <p> Job history files are also logged to the user-specified directory
- <code>hadoop.job.history.user.location</code>,
- which defaults to the job output directory. The files are stored in
- "_logs/history/" in the specified directory. Hence, by default, they
- will be in <code>mapred.output.dir/_logs/history</code>. The user can stop
- logging by giving the value <code>none</code> for
- <code>hadoop.job.history.user.location</code>.</p>
|
|
|
-
|
|
|
- <p> The user can view a summary of the history logs in the specified directory
|
|
|
- using the following command <br/>
|
|
|
- <code>$ bin/hadoop job -history output-dir</code><br/>
|
|
|
- This command will print job details, failed and killed tip
|
|
|
- details. <br/>
|
|
|
- More details about the job such as successful tasks and
|
|
|
- task attempts made for each task can be viewed using the
|
|
|
- following command <br/>
|
|
|
- <code>$ bin/hadoop job -history all output-dir</code><br/></p>
|
|
|
-
|
|
|
- <p> The user can use
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/outputlogfilter">OutputLogFilter</a>
|
|
|
- to filter log files from the output directory listing. </p>
|
|
|
-
|
|
|
- <p>Normally the user creates the application, describes various facets
|
|
|
- of the job via <code>JobConf</code>, and then uses the
|
|
|
- <code>JobClient</code> to submit the job and monitor its progress.</p>
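- <p>As a minimal sketch (the class name and paths below are illustrative,
- not from the tutorial), a typical driver looks like:</p>
- <p>
-   <code>JobConf conf = new JobConf(MyJob.class); // hypothetical job class</code><br/>
-   <code>conf.setJobName("myjob");</code><br/>
-   <code>FileInputFormat.setInputPaths(conf, new Path("in")); // illustrative paths</code><br/>
-   <code>FileOutputFormat.setOutputPath(conf, new Path("out"));</code><br/>
-   <code>JobClient.runJob(conf); // submit and wait for completion</code>
- </p>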
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Job Control</title>
|
|
|
-
|
|
|
- <p>Users may need to chain Map/Reduce jobs to accomplish complex
|
|
|
- tasks which cannot be done via a single Map/Reduce job. This is fairly
|
|
|
- easy since the output of the job typically goes to distributed
|
|
|
- file-system, and the output, in turn, can be used as the input for the
|
|
|
- next job.</p>
|
|
|
-
|
|
|
- <p>However, this also means that the onus on ensuring jobs are
|
|
|
- complete (success/failure) lies squarely on the clients. In such
|
|
|
- cases, the various job-control options are:</p>
|
|
|
- <ul>
|
|
|
- <li>
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobclient/runjob">
|
|
|
- runJob(JobConf)</a> : Submits the job and returns only after the
|
|
|
- job has completed.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobclient/submitjob">
|
|
|
- submitJob(JobConf)</a> : Only submits the job; the client should then
- poll the returned handle to the
- <a href="ext:api/org/apache/hadoop/mapred/runningjob">
- RunningJob</a> to query status and make scheduling decisions
- (a minimal polling sketch follows this list).
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setjobendnotificationuri">
|
|
|
- JobConf.setJobEndNotificationURI(String)</a> : Sets up a
|
|
|
- notification upon job-completion, thus avoiding polling.
|
|
|
- </li>
|
|
|
- </ul>
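- <p>A minimal polling sketch using <code>submitJob</code> (the
- <code>JobConf</code> instance <code>conf</code> is assumed to already
- describe the job):</p>
- <p>
-   <code>JobClient jc = new JobClient(conf);</code><br/>
-   <code>RunningJob rj = jc.submitJob(conf);</code><br/>
-   <code>while (!rj.isComplete()) { Thread.sleep(5000); } // poll until done</code><br/>
-   <code>if (!rj.isSuccessful()) { /* e.g. resubmit or abort the chain */ }</code>
- </p>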
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Job Input</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/inputformat">
|
|
|
- InputFormat</a> describes the input-specification for a Map/Reduce job.
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>The Map/Reduce framework relies on the <code>InputFormat</code> of
|
|
|
- the job to:</p>
|
|
|
- <ol>
|
|
|
- <li>Validate the input-specification of the job.</li>
|
|
|
- <li>
|
|
|
- Split-up the input file(s) into logical <code>InputSplit</code>
|
|
|
- instances, each of which is then assigned to an individual
|
|
|
- <code>Mapper</code>.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Provide the <code>RecordReader</code> implementation used to
|
|
|
- glean input records from the logical <code>InputSplit</code> for
|
|
|
- processing by the <code>Mapper</code>.
|
|
|
- </li>
|
|
|
- </ol>
|
|
|
-
|
|
|
- <p>The default behavior of file-based <code>InputFormat</code>
|
|
|
- implementations, typically sub-classes of
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/fileinputformat">
|
|
|
- FileInputFormat</a>, is to split the input into <em>logical</em>
|
|
|
- <code>InputSplit</code> instances based on the total size, in bytes, of
|
|
|
- the input files. However, the <code>FileSystem</code> blocksize of the
|
|
|
- input files is treated as an upper bound for input splits. A lower bound
|
|
|
- on the split size can be set via <code>mapred.min.split.size</code>.</p>
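- <p>For example, a lower bound of 64MB on split sizes could be requested as
- follows (a sketch, assuming a <code>JobConf</code> named <code>conf</code>):</p>
- <p>
-   <code>conf.setLong("mapred.min.split.size", 64 * 1024 * 1024); // 64MB lower bound</code>
- </p>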
|
|
|
-
|
|
|
- <p>Clearly, logical splits based on input-size are insufficient for many
|
|
|
- applications since record boundaries must be respected. In such cases,
|
|
|
- the application should implement a <code>RecordReader</code>, which is
- responsible for respecting record-boundaries and presenting a
|
|
|
- record-oriented view of the logical <code>InputSplit</code> to the
|
|
|
- individual task.</p>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/textinputformat">
|
|
|
- TextInputFormat</a> is the default <code>InputFormat</code>.</p>
|
|
|
-
|
|
|
- <p>If <code>TextInputFormat</code> is the <code>InputFormat</code> for a
|
|
|
- given job, the framework detects input-files with the <em>.gz</em>
|
|
|
- extensions and automatically decompresses them using the
|
|
|
- appropriate <code>CompressionCodec</code>. However, it must be noted that
|
|
|
- compressed files with the above extensions cannot be <em>split</em> and
|
|
|
- each compressed file is processed in its entirety by a single mapper.</p>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>InputSplit</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/inputsplit">
|
|
|
- InputSplit</a> represents the data to be processed by an individual
|
|
|
- <code>Mapper</code>.</p>
|
|
|
-
|
|
|
- <p>Typically <code>InputSplit</code> presents a byte-oriented view of
|
|
|
- the input, and it is the responsibility of <code>RecordReader</code>
|
|
|
- to process and present a record-oriented view.</p>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/filesplit">
|
|
|
- FileSplit</a> is the default <code>InputSplit</code>. It sets
|
|
|
- <code>map.input.file</code> to the path of the input file for the
|
|
|
- logical split.</p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>RecordReader</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/recordreader">
|
|
|
- RecordReader</a> reads <code><key, value></code> pairs from an
|
|
|
- <code>InputSplit</code>.</p>
|
|
|
-
|
|
|
- <p>Typically the <code>RecordReader</code> converts the byte-oriented
|
|
|
- view of the input, provided by the <code>InputSplit</code>, and
|
|
|
- presents a record-oriented view to the <code>Mapper</code> implementations
|
|
|
- for processing. <code>RecordReader</code> thus assumes the
|
|
|
- responsibility of processing record boundaries and presents the tasks
|
|
|
- with keys and values.</p>
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Job Output</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/outputformat">
|
|
|
- OutputFormat</a> describes the output-specification for a Map/Reduce
|
|
|
- job.</p>
|
|
|
-
|
|
|
- <p>The Map/Reduce framework relies on the <code>OutputFormat</code> of
|
|
|
- the job to:</p>
|
|
|
- <ol>
|
|
|
- <li>
|
|
|
- Validate the output-specification of the job; for example, check that
|
|
|
- the output directory doesn't already exist.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Provide the <code>RecordWriter</code> implementation used to
|
|
|
- write the output files of the job. Output files are stored in a
|
|
|
- <code>FileSystem</code>.
|
|
|
- </li>
|
|
|
- </ol>
|
|
|
-
|
|
|
- <p><code>TextOutputFormat</code> is the default
|
|
|
- <code>OutputFormat</code>.</p>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Lazy Output Creation</title>
|
|
|
- <p>It is possible to delay creation of output until the first write attempt
|
|
|
- by using <a href="ext:api/org/apache/hadoop/mapred/lib/lazyoutputformat">
|
|
|
- LazyOutputFormat</a>. This is particularly useful in preventing the
|
|
|
- creation of zero byte files when there is no call to output.collect
|
|
|
- (or Context.write). This is achieved by calling the static method
|
|
|
- <code>setOutputFormatClass</code> of <code>LazyOutputFormat</code>
|
|
|
- with the intended <code>OutputFormat</code> as the argument. The following example
|
|
|
- shows how to delay creation of files when using the <code>TextOutputFormat</code>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>
|
|
|
- <code> import org.apache.hadoop.mapred.lib.LazyOutputFormat;</code> <br/>
|
|
|
- <code> LazyOutputFormat.setOutputFormatClass(conf, TextOutputFormat.class);</code>
|
|
|
- </p>
|
|
|
-
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>OutputCommitter</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/outputcommitter">
|
|
|
- OutputCommitter</a> describes the commit of task output for a
|
|
|
- Map/Reduce job.</p>
|
|
|
-
|
|
|
- <p>The Map/Reduce framework relies on the <code>OutputCommitter</code>
|
|
|
- of the job to:</p>
|
|
|
- <ol>
|
|
|
- <li>
|
|
|
- Setup the job during initialization. For example, create
|
|
|
- the temporary output directory for the job during the
|
|
|
- initialization of the job.
|
|
|
- Job setup is done by a separate task when the job is
|
|
|
- in PREP state and after initializing tasks. Once the setup task
|
|
|
- completes, the job will be moved to RUNNING state.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Cleanup the job after the job completion. For example, remove the
|
|
|
- temporary output directory after the job completion.
|
|
|
- Job cleanup is done by a separate task at the end of the job.
|
|
|
- The job is declared SUCCEEDED/FAILED/KILLED after the cleanup
|
|
|
- task completes.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Setup the task temporary output.
|
|
|
- Task setup is done as part of the same task, during task initialization.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Check whether a task needs a commit. This is to avoid the commit
|
|
|
- procedure if a task does not need commit.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Commit of the task output.
|
|
|
- Once the task is done, it will commit its output if required.
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Discard the task commit.
|
|
|
- If the task failed or was killed, its output will be cleaned up.
- If the task could not perform the cleanup itself (for example, due to
- an exception), a separate task will be launched with the same
- attempt-id to do the cleanup.
|
|
|
- </li>
|
|
|
- </ol>
|
|
|
- <p><code>FileOutputCommitter</code> is the default
|
|
|
- <code>OutputCommitter</code>. Job setup/cleanup tasks occupy
|
|
|
- map or reduce slots, whichever is free on the TaskTracker.
- JobCleanup tasks, TaskCleanup tasks and JobSetup tasks have the highest
- priority, in that order.</p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Task Side-Effect Files</title>
|
|
|
-
|
|
|
- <p>In some applications, component tasks need to create and/or write to
|
|
|
- side-files, which differ from the actual job-output files.</p>
|
|
|
-
|
|
|
- <p>In such cases there could be issues with two instances of the same
|
|
|
- <code>Mapper</code> or <code>Reducer</code> running simultaneously (for
|
|
|
- example, speculative tasks) trying to open and/or write to the same
|
|
|
- file (path) on the <code>FileSystem</code>. Hence the
|
|
|
- application-writer will have to pick unique names per task-attempt
|
|
|
- (using the attemptid, say <code>attempt_200709221812_0001_m_000000_0</code>),
|
|
|
- not just per task.</p>
|
|
|
-
|
|
|
- <p>To avoid these issues the Map/Reduce framework, when the
|
|
|
- <code>OutputCommitter</code> is <code>FileOutputCommitter</code>,
|
|
|
- maintains a special
|
|
|
- <code>${mapred.output.dir}/_temporary/_${taskid}</code> sub-directory
|
|
|
- accessible via <code>${mapred.work.output.dir}</code>
|
|
|
- for each task-attempt on the <code>FileSystem</code> where the output
|
|
|
- of the task-attempt is stored. On successful completion of the
|
|
|
- task-attempt, the files in the
|
|
|
- <code>${mapred.output.dir}/_temporary/_${taskid}</code> (only)
|
|
|
- are <em>promoted</em> to <code>${mapred.output.dir}</code>. Of course,
|
|
|
- the framework discards the sub-directory of unsuccessful task-attempts.
|
|
|
- This process is completely transparent to the application.</p>
|
|
|
-
|
|
|
- <p>The application-writer can take advantage of this feature by
|
|
|
- creating any side-files required in <code>${mapred.work.output.dir}</code>
|
|
|
- during execution of a task via
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/fileoutputformat/getworkoutputpath">
|
|
|
- FileOutputFormat.getWorkOutputPath()</a>, and the framework will promote them
|
|
|
- similarly for successful task-attempts, thus eliminating the need to
|
|
|
- pick unique paths per task-attempt.</p>
|
|
|
-
|
|
|
- <p>Note: The value of <code>${mapred.work.output.dir}</code> during
|
|
|
- execution of a particular task-attempt is actually
|
|
|
- <code>${mapred.output.dir}/_temporary/_${taskid}</code>, and this value is
|
|
|
- set by the Map/Reduce framework. So, just create any side-files in the
|
|
|
- path returned by
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/fileoutputformat/getworkoutputpath">
|
|
|
- FileOutputFormat.getWorkOutputPath() </a>from map/reduce
|
|
|
- task to take advantage of this feature.</p>
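- <p>A minimal sketch of creating such a side-file from within a task (the
- file name shown is illustrative; <code>conf</code> is the task's
- <code>JobConf</code>):</p>
- <p>
-   <code>Path workDir = FileOutputFormat.getWorkOutputPath(conf);</code><br/>
-   <code>Path sideFile = new Path(workDir, "side-data.txt"); // illustrative name</code><br/>
-   <code>FSDataOutputStream out = sideFile.getFileSystem(conf).create(sideFile);</code><br/>
-   <code>// ... write to 'out' and close it; the framework promotes the file on success</code>
- </p>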
|
|
|
-
|
|
|
- <p>The entire discussion holds true for maps of jobs with
|
|
|
- reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
|
|
|
- goes directly to HDFS.</p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>RecordWriter</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/recordwriter">
|
|
|
- RecordWriter</a> writes the output <code><key, value></code>
|
|
|
- pairs to an output file.</p>
|
|
|
-
|
|
|
- <p>RecordWriter implementations write the job outputs to the
|
|
|
- <code>FileSystem</code>.</p>
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Other Useful Features</title>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Submitting Jobs to Queues</title>
|
|
|
- <p>Users submit jobs to Queues. Queues, as collections of jobs,
|
|
|
- allow the system to provide specific functionality. For example,
|
|
|
- queues use ACLs to control which users
- can submit jobs to them. Queues are expected to be primarily
|
|
|
- used by Hadoop Schedulers. </p>
|
|
|
-
|
|
|
- <p>Hadoop comes configured with a single mandatory queue, called
|
|
|
- 'default'. Queue names are defined in the
|
|
|
- <code>mapred.queue.names</code> property of the Hadoop site
|
|
|
- configuration. Some job schedulers, such as the
|
|
|
- <a href="capacity_scheduler.html">Capacity Scheduler</a>,
|
|
|
- support multiple queues.</p>
|
|
|
-
|
|
|
- <p>A job defines the queue it needs to be submitted to through the
|
|
|
- <code>mapred.job.queue.name</code> property, or through the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setqueuename">setQueueName(String)</a>
|
|
|
- API. Setting the queue name is optional. If a job is submitted
|
|
|
- without an associated queue name, it is submitted to the 'default'
|
|
|
- queue.</p>
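- <p>For example (assuming a queue named <code>research</code> has been
- configured by the cluster administrator):</p>
- <p>
-   <code>conf.setQueueName("research"); // or -Dmapred.job.queue.name=research on the command line</code>
- </p>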
|
|
|
- </section>
|
|
|
- <section>
|
|
|
- <title>Counters</title>
|
|
|
-
|
|
|
- <p><code>Counters</code> represent global counters, defined either by
|
|
|
- the Map/Reduce framework or applications. Each <code>Counter</code> can
|
|
|
- be of any <code>Enum</code> type. Counters of a particular
|
|
|
- <code>Enum</code> are bunched into groups of type
|
|
|
- <code>Counters.Group</code>.</p>
|
|
|
-
|
|
|
- <p>Applications can define arbitrary <code>Counters</code> (of type
|
|
|
- <code>Enum</code>) and update them via
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/reporter/incrcounterEnum">
|
|
|
- Reporter.incrCounter(Enum, long)</a> or
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/reporter/incrcounterString">
|
|
|
- Reporter.incrCounter(String, String, long)</a>
|
|
|
- in the <code>map</code> and/or
|
|
|
- <code>reduce</code> methods. These counters are then globally
|
|
|
- aggregated by the framework.</p>
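- <p>A minimal sketch (the enum name is illustrative) of defining and
- updating such a counter from a <code>map</code> method:</p>
- <p>
-   <code>static enum MyCounters { MALFORMED_RECORDS } // illustrative enum</code><br/>
-   <code>// inside map(...), with the Reporter passed by the framework:</code><br/>
-   <code>reporter.incrCounter(MyCounters.MALFORMED_RECORDS, 1);</code>
- </p>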
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>DistributedCache</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/filecache/distributedcache">
|
|
|
- DistributedCache</a> distributes application-specific, large, read-only
|
|
|
- files efficiently.</p>
|
|
|
-
|
|
|
- <p><code>DistributedCache</code> is a facility provided by the
|
|
|
- Map/Reduce framework to cache files (text, archives, jars and so on)
|
|
|
- needed by applications.</p>
|
|
|
-
|
|
|
- <p>Applications specify the files to be cached via urls (hdfs://)
|
|
|
- in the <code>JobConf</code>. The <code>DistributedCache</code>
|
|
|
- assumes that the files specified via hdfs:// urls are already present
|
|
|
- on the <code>FileSystem</code>.</p>
|
|
|
-
|
|
|
- <p>The framework will copy the necessary files to the slave node
|
|
|
- before any tasks for the job are executed on that node. Its
|
|
|
- efficiency stems from the fact that the files are only copied once
|
|
|
- per job and the ability to cache archives which are un-archived on
|
|
|
- the slaves.</p>
|
|
|
-
|
|
|
- <p><code>DistributedCache</code> tracks the modification timestamps of
|
|
|
- the cached files. Clearly the cache files should not be modified by
|
|
|
- the application or externally while the job is executing.</p>
|
|
|
-
|
|
|
- <p><code>DistributedCache</code> can be used to distribute simple,
|
|
|
- read-only data/text files and more complex types such as archives and
|
|
|
- jars. Archives (zip, tar, tgz and tar.gz files) are
|
|
|
- <em>un-archived</em> at the slave nodes. Files
|
|
|
- have <em>execution permissions</em> set. </p>
|
|
|
-
|
|
|
- <p>The files/archives can be distributed by setting the property
|
|
|
- <code>mapred.cache.{files|archives}</code>. If more than one
|
|
|
- file/archive has to be distributed, they can be added as comma
|
|
|
- separated paths. The properties can also be set by APIs
|
|
|
- <a href="ext:api/org/apache/hadoop/filecache/distributedcache/addcachefile">
|
|
|
- DistributedCache.addCacheFile(URI,conf)</a>/
|
|
|
- <a href="ext:api/org/apache/hadoop/filecache/distributedcache/addcachearchive">
|
|
|
- DistributedCache.addCacheArchive(URI,conf)</a> and
|
|
|
- <a href="ext:api/org/apache/hadoop/filecache/distributedcache/setcachefiles">
|
|
|
- DistributedCache.setCacheFiles(URIs,conf)</a>/
|
|
|
- <a href="ext:api/org/apache/hadoop/filecache/distributedcache/setcachearchives">
|
|
|
- DistributedCache.setCacheArchives(URIs,conf)</a>
|
|
|
- where URI is of the form
|
|
|
- <code>hdfs://host:port/absolute-path#link-name</code>.
|
|
|
- In Streaming, the files can be distributed through command line
|
|
|
- option <code>-cacheFile/-cacheArchive</code>.</p>
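- <p>For illustration (the URIs below are hypothetical), files and archives
- can be added from the job driver as follows:</p>
- <p>
-   <code>DistributedCache.addCacheFile(new URI("hdfs://namenode:9000/user/joe/lookup.dat"), conf);</code><br/>
-   <code>DistributedCache.addCacheArchive(new URI("hdfs://namenode:9000/user/joe/dict.zip"), conf);</code>
- </p>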
|
|
|
-
|
|
|
- <p>Optionally users can also direct the <code>DistributedCache</code>
|
|
|
- to <em>symlink</em> the cached file(s) into the <code>current working
|
|
|
- directory</code> of the task via the
|
|
|
- <a href="ext:api/org/apache/hadoop/filecache/distributedcache/createsymlink">
|
|
|
- DistributedCache.createSymlink(Configuration)</a> api, or by setting
- the configuration property <code>mapred.create.symlink</code>
- to <code>yes</code>. The DistributedCache will use the
|
|
|
- <code>fragment</code> of the URI as the name of the symlink.
|
|
|
- For example, the URI
|
|
|
- <code>hdfs://namenode:port/lib.so.1#lib.so</code>
|
|
|
- will have the symlink name <code>lib.so</code> in the task's cwd
|
|
|
- for the file <code>lib.so.1</code> in distributed cache.</p>
|
|
|
-
|
|
|
- <p>The <code>DistributedCache</code> can also be used as a
|
|
|
- rudimentary software distribution mechanism for use in the
|
|
|
- map and/or reduce tasks. It can be used to distribute both
|
|
|
- jars and native libraries. The
|
|
|
- <a href="ext:api/org/apache/hadoop/filecache/distributedcache/addarchivetoclasspath">
|
|
|
- DistributedCache.addArchiveToClassPath(Path, Configuration)</a> or
|
|
|
- <a href="ext:api/org/apache/hadoop/filecache/distributedcache/addfiletoclasspath">
|
|
|
- DistributedCache.addFileToClassPath(Path, Configuration)</a> api
|
|
|
- can be used to cache files/jars and also add them to the
|
|
|
- <em>classpath</em> of child-jvm. The same can be done by setting
|
|
|
- the configuration properties
|
|
|
- <code>mapred.job.classpath.{files|archives}</code>. Similarly the
|
|
|
- cached files that are symlinked into the working directory of the
|
|
|
- task can be used to distribute native libraries and load them.</p>
|
|
|
-
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Tool</title>
|
|
|
-
|
|
|
- <p>The <a href="ext:api/org/apache/hadoop/util/tool">Tool</a>
|
|
|
- interface supports the handling of generic Hadoop command-line options.
|
|
|
- </p>
|
|
|
-
|
|
|
- <p><code>Tool</code> is the standard for any Map/Reduce tool or
|
|
|
- application. The application should delegate the handling of
|
|
|
- standard command-line options to
|
|
|
- <a href="ext:api/org/apache/hadoop/util/genericoptionsparser">
|
|
|
- GenericOptionsParser</a> via
|
|
|
- <a href="ext:api/org/apache/hadoop/util/toolrunner/run">
|
|
|
- ToolRunner.run(Tool, String[])</a> and only handle its custom
|
|
|
- arguments.</p>
|
|
|
-
|
|
|
- <p>
|
|
|
- The generic Hadoop command-line options are:<br/>
|
|
|
- <code>
|
|
|
- -conf <configuration file>
|
|
|
- </code>
|
|
|
- <br/>
|
|
|
- <code>
|
|
|
- -D <property=value>
|
|
|
- </code>
|
|
|
- <br/>
|
|
|
- <code>
|
|
|
- -fs <local|namenode:port>
|
|
|
- </code>
|
|
|
- <br/>
|
|
|
- <code>
|
|
|
- -jt <local|jobtracker:port>
|
|
|
- </code>
|
|
|
- </p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>IsolationRunner</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/isolationrunner">
|
|
|
- IsolationRunner</a> is a utility to help debug Map/Reduce programs.</p>
|
|
|
-
|
|
|
- <p>To use the <code>IsolationRunner</code>, first set
|
|
|
- <code>keep.failed.tasks.files</code> to <code>true</code>
|
|
|
- (also see <code>keep.tasks.files.pattern</code>).</p>
|
|
|
-
|
|
|
- <p>
|
|
|
- Next, go to the node on which the failed task ran and go to the
|
|
|
- <code>TaskTracker</code>'s local directory and run the
|
|
|
- <code>IsolationRunner</code>:<br/>
|
|
|
- <code>$ cd <local path>/taskTracker/${taskid}/work</code><br/>
|
|
|
- <code>
|
|
|
- $ bin/hadoop org.apache.hadoop.mapred.IsolationRunner ../job.xml
|
|
|
- </code>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p><code>IsolationRunner</code> will run the failed task in a single
|
|
|
- jvm, which can be in the debugger, over precisely the same input.</p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Profiling</title>
|
|
|
- <p>Profiling is a utility to get a representative (2 or 3) sample
|
|
|
- of built-in java profiler output for a sample of maps and reduces. </p>
|
|
|
-
|
|
|
- <p>User can specify whether the system should collect profiler
|
|
|
- information for some of the tasks in the job by setting the
|
|
|
- configuration property <code>mapred.task.profile</code>. The
|
|
|
- value can be set using the api
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setprofileenabled">
|
|
|
- JobConf.setProfileEnabled(boolean)</a>. If the value is set
|
|
|
- <code>true</code>, the task profiling is enabled. The profiler
|
|
|
- information is stored in the user log directory. By default,
|
|
|
- profiling is not enabled for the job. </p>
|
|
|
-
|
|
|
- <p>Once profiling is enabled, the user can use
|
|
|
- the configuration property
|
|
|
- <code>mapred.task.profile.{maps|reduces}</code> to set the ranges
|
|
|
- of map/reduce tasks to profile. The value can be set using the api
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setprofiletaskrange">
|
|
|
- JobConf.setProfileTaskRange(boolean,String)</a>.
|
|
|
- By default, the specified range is <code>0-2</code>.</p>
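- <p>A minimal sketch enabling profiling for the first two map tasks only
- (assuming a <code>JobConf</code> named <code>conf</code>):</p>
- <p>
-   <code>conf.setProfileEnabled(true);</code><br/>
-   <code>conf.setProfileTaskRange(true, "0-1"); // true selects map tasks; "0-1" is the task range</code>
- </p>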
|
|
|
-
|
|
|
- <p>User can also specify the profiler configuration arguments by
|
|
|
- setting the configuration property
|
|
|
- <code>mapred.task.profile.params</code>. The value can be specified
|
|
|
- using the api
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setprofileparams">
|
|
|
- JobConf.setProfileParams(String)</a>. If the string contains a
|
|
|
- <code>%s</code>, it will be replaced with the name of the profiling
|
|
|
- output file when the task runs. These parameters are passed to the
|
|
|
- task child JVM on the command line. The default value for
|
|
|
- the profiling parameters is
|
|
|
- <code>-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s</code>
|
|
|
- </p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Debugging</title>
|
|
|
- <p>The Map/Reduce framework provides a facility to run user-provided
|
|
|
- scripts for debugging. When a map/reduce task fails, a user can run
|
|
|
- a debug script, to process task logs for example. The script is
|
|
|
- given access to the task's stdout and stderr outputs, syslog and
|
|
|
- jobconf. The output from the debug script's stdout and stderr is
|
|
|
- displayed on the console diagnostics and also as part of the
|
|
|
- job UI. </p>
|
|
|
-
|
|
|
- <p> In the following sections we discuss how to submit a debug script
|
|
|
- with a job. The script file needs to be distributed and submitted to
|
|
|
- the framework.</p>
|
|
|
- <section>
|
|
|
- <title> How to distribute the script file: </title>
|
|
|
- <p>
|
|
|
- The user needs to use
|
|
|
- <a href="mapred_tutorial.html#DistributedCache">DistributedCache</a>
|
|
|
- to <em>distribute</em> and <em>symlink</em> the script file.</p>
|
|
|
- </section>
|
|
|
- <section>
|
|
|
- <title> How to submit the script: </title>
|
|
|
- <p> A quick way to submit the debug script is to set values for the
|
|
|
- properties <code>mapred.map.task.debug.script</code> and
|
|
|
- <code>mapred.reduce.task.debug.script</code>, for debugging map and
|
|
|
- reduce tasks respectively. These properties can also be set by using APIs
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setmapdebugscript">
|
|
|
- JobConf.setMapDebugScript(String) </a> and
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setreducedebugscript">
|
|
|
- JobConf.setReduceDebugScript(String) </a>. In streaming mode, a debug
|
|
|
- script can be submitted with the command-line options
|
|
|
- <code>-mapdebug</code> and <code>-reducedebug</code>, for debugging
|
|
|
- map and reduce tasks respectively.</p>
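- <p>For example (the script names are illustrative and must already have
- been distributed and symlinked via the DistributedCache):</p>
- <p>
-   <code>conf.setMapDebugScript("./myMapDebugScript.sh"); // illustrative, symlinked script name</code><br/>
-   <code>conf.setReduceDebugScript("./myReduceDebugScript.sh");</code>
- </p>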
|
|
|
-
|
|
|
- <p>The arguments to the script are the task's stdout, stderr,
|
|
|
- syslog and jobconf files. The debug command, run on the node where
|
|
|
- the map/reduce task failed, is: <br/>
|
|
|
- <code> $script $stdout $stderr $syslog $jobconf </code> </p>
|
|
|
-
|
|
|
- <p> Pipes programs have the C++ program name as a fifth argument
|
|
|
- for the command. Thus for the pipes programs the command is <br/>
|
|
|
- <code>$script $stdout $stderr $syslog $jobconf $program </code>
|
|
|
- </p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title> Default Behavior: </title>
|
|
|
- <p> For pipes, a default script is run that processes core dumps under
- gdb, prints the stack trace and gives information about running threads. </p>
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>JobControl</title>
|
|
|
-
|
|
|
- <p><a href="ext:api/org/apache/hadoop/mapred/jobcontrol/package-summary">
|
|
|
- JobControl</a> is a utility which encapsulates a set of Map/Reduce jobs
|
|
|
- and their dependencies.</p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Data Compression</title>
|
|
|
-
|
|
|
- <p>Hadoop Map/Reduce provides facilities for the application-writer to
|
|
|
- specify compression for both intermediate map-outputs and the
|
|
|
- job-outputs i.e. output of the reduces. It also comes bundled with
|
|
|
- <a href="ext:api/org/apache/hadoop/io/compress/compressioncodec">
|
|
|
- CompressionCodec</a> implementation for the
|
|
|
- <a href="ext:zlib">zlib</a> compression
|
|
|
- algorithm. The <a href="ext:gzip">gzip</a> file format is also
|
|
|
- supported.</p>
|
|
|
-
|
|
|
- <p>Hadoop also provides native implementations of the above compression
|
|
|
- codecs for reasons of both performance (zlib) and non-availability of
|
|
|
- Java libraries. More details on their usage and availability are
|
|
|
- available <a href="native_libraries.html">here</a>.</p>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Intermediate Outputs</title>
|
|
|
-
|
|
|
- <p>Applications can control compression of intermediate map-outputs
|
|
|
- via the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setcompressmapoutput">
|
|
|
- JobConf.setCompressMapOutput(boolean)</a> api and the
|
|
|
- <code>CompressionCodec</code> to be used via the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setmapoutputcompressorclass">
|
|
|
- JobConf.setMapOutputCompressorClass(Class)</a> api.</p>
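- <p>For example (a sketch, assuming a <code>JobConf</code> named
- <code>conf</code>), map outputs can be compressed with the gzip codec:</p>
- <p>
-   <code>conf.setCompressMapOutput(true);</code><br/>
-   <code>conf.setMapOutputCompressorClass(GzipCodec.class); // org.apache.hadoop.io.compress.GzipCodec</code>
- </p>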
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Job Outputs</title>
|
|
|
-
|
|
|
- <p>Applications can control compression of job-outputs via the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/fileoutputformat/setcompressoutput">
|
|
|
- FileOutputFormat.setCompressOutput(JobConf, boolean)</a> api and the
|
|
|
- <code>CompressionCodec</code> to be used can be specified via the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/fileoutputformat/setoutputcompressorclass">
|
|
|
- FileOutputFormat.setOutputCompressorClass(JobConf, Class)</a> api.</p>
|
|
|
-
|
|
|
- <p>If the job outputs are to be stored in the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/sequencefileoutputformat">
|
|
|
- SequenceFileOutputFormat</a>, the required
|
|
|
- <code>SequenceFile.CompressionType</code> (i.e. <code>RECORD</code> /
|
|
|
- <code>BLOCK</code> - defaults to <code>RECORD</code>) can be
|
|
|
- specified via the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/sequencefileoutputformat/setoutputcompressiontype">
|
|
|
- SequenceFileOutputFormat.setOutputCompressionType(JobConf,
|
|
|
- SequenceFile.CompressionType)</a> api.</p>
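- <p>For example, block-compressed SequenceFile job outputs could be
- configured as follows (a sketch, assuming a <code>JobConf</code> named
- <code>conf</code>):</p>
- <p>
-   <code>conf.setOutputFormat(SequenceFileOutputFormat.class);</code><br/>
-   <code>FileOutputFormat.setCompressOutput(conf, true);</code><br/>
-   <code>FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);</code><br/>
-   <code>SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);</code>
- </p>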
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Skipping Bad Records</title>
|
|
|
- <p>Hadoop provides an option where a certain set of bad input
|
|
|
- records can be skipped when processing map inputs. Applications
|
|
|
- can control this feature through the
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/skipbadrecords">
|
|
|
- SkipBadRecords</a> class.</p>
|
|
|
-
|
|
|
- <p>This feature can be used when map tasks crash deterministically
|
|
|
- on certain input. This usually happens due to bugs in the
|
|
|
- map function. Usually, the user would have to fix these bugs.
|
|
|
- Sometimes, however, this is not possible. The bug may be in third
|
|
|
- party libraries, for example, for which the source code is not
|
|
|
- available. In such cases, the task never completes successfully even
|
|
|
- after multiple attempts, and the job fails. With this feature, only
|
|
|
- a small portion of data surrounding the
|
|
|
- bad records is lost, which may be acceptable for some applications
|
|
|
- (those performing statistical analysis on very large data, for
|
|
|
- example). </p>
|
|
|
-
|
|
|
- <p>By default this feature is disabled. For enabling it,
|
|
|
- refer to <a href="ext:api/org/apache/hadoop/mapred/skipbadrecords/setmappermaxskiprecords">
|
|
|
- SkipBadRecords.setMapperMaxSkipRecords(Configuration, long)</a> and
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/skipbadrecords/setreducermaxskipgroups">
|
|
|
- SkipBadRecords.setReducerMaxSkipGroups(Configuration, long)</a>.
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>With this feature enabled, the framework gets into 'skipping
|
|
|
- mode' after a certain number of map failures. For more details,
|
|
|
- see <a href="ext:api/org/apache/hadoop/mapred/skipbadrecords/setattemptsTostartskipping">
|
|
|
- SkipBadRecords.setAttemptsToStartSkipping(Configuration, int)</a>.
|
|
|
- In 'skipping mode', map tasks maintain the range of records being
|
|
|
- processed. To do this, the framework relies on the processed record
|
|
|
- counter. See <a href="ext:api/org/apache/hadoop/mapred/skipbadrecords/counter_map_processed_records">
|
|
|
- SkipBadRecords.COUNTER_MAP_PROCESSED_RECORDS</a> and
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/skipbadrecords/counter_reduce_processed_groups">
|
|
|
- SkipBadRecords.COUNTER_REDUCE_PROCESSED_GROUPS</a>.
|
|
|
- This counter enables the framework to know how many records have
|
|
|
- been processed successfully, and hence, what record range caused
|
|
|
- a task to crash. On further attempts, this range of records is
|
|
|
- skipped.</p>
|
|
|
-
|
|
|
- <p>The number of records skipped depends on how frequently the
|
|
|
- processed record counter is incremented by the application.
|
|
|
- It is recommended that this counter be incremented after every
|
|
|
- record is processed. This may not be possible in some applications
|
|
|
- that typically batch their processing. In such cases, the framework
|
|
|
- may skip additional records surrounding the bad record. Users can
|
|
|
- control the number of skipped records through
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/skipbadrecords/setmappermaxskiprecords">
|
|
|
- SkipBadRecords.setMapperMaxSkipRecords(Configuration, long)</a> and
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/skipbadrecords/setreducermaxskipgroups">
|
|
|
- SkipBadRecords.setReducerMaxSkipGroups(Configuration, long)</a>.
|
|
|
- The framework tries to narrow the range of skipped records using a
|
|
|
- binary search-like approach. The skipped range is divided into two
|
|
|
- halves and only one half gets executed. On subsequent
|
|
|
- failures, the framework figures out which half contains
|
|
|
- bad records. A task will be re-executed until the
|
|
|
- acceptable skipped value is met or all task attempts are exhausted.
|
|
|
- To increase the number of task attempts, use
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setmaxmapattempts">
|
|
|
- JobConf.setMaxMapAttempts(int)</a> and
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/jobconf/setmaxreduceattempts">
|
|
|
- JobConf.setMaxReduceAttempts(int)</a>.
|
|
|
- </p>
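- <p>Putting these together, a minimal sketch that turns on skipping for map
- tasks, tolerates up to 100 skipped records per bad range, and allows more
- task attempts (assuming a <code>JobConf</code> named <code>conf</code>):</p>
- <p>
-   <code>SkipBadRecords.setMapperMaxSkipRecords(conf, 100); // enable skipping, tolerate up to 100 records</code><br/>
-   <code>SkipBadRecords.setAttemptsToStartSkipping(conf, 2); // enter skipping mode after 2 failures</code><br/>
-   <code>conf.setMaxMapAttempts(8); // allow enough attempts for the binary search to narrow the range</code>
- </p>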
|
|
|
-
|
|
|
- <p>Skipped records are written to HDFS in the sequence file
|
|
|
- format, for later analysis. The location can be changed through
|
|
|
- <a href="ext:api/org/apache/hadoop/mapred/skipbadrecords/setskipoutputpath">
|
|
|
- SkipBadRecords.setSkipOutputPath(JobConf, Path)</a>.
|
|
|
- </p>
|
|
|
-
|
|
|
- </section>
|
|
|
-
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Example: WordCount v2.0</title>
|
|
|
-
|
|
|
- <p>Here is a more complete <code>WordCount</code> which uses many of the
|
|
|
- features provided by the Map/Reduce framework we discussed so far.</p>
|
|
|
-
|
|
|
- <p>This needs the HDFS to be up and running, especially for the
|
|
|
- <code>DistributedCache</code>-related features. Hence it only works with a
|
|
|
- <a href="quickstart.html#SingleNodeSetup">pseudo-distributed</a> or
|
|
|
- <a href="quickstart.html#Fully-Distributed+Operation">fully-distributed</a>
|
|
|
- Hadoop installation.</p>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Source Code</title>
|
|
|
-
|
|
|
- <table>
|
|
|
- <tr>
|
|
|
- <th></th>
|
|
|
- <th>WordCount.java</th>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>1.</td>
|
|
|
- <td>
|
|
|
- <code>package org.myorg;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>2.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>3.</td>
|
|
|
- <td>
|
|
|
- <code>import java.io.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>4.</td>
|
|
|
- <td>
|
|
|
- <code>import java.util.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>5.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>6.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.fs.Path;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>7.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.filecache.DistributedCache;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>8.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.conf.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>9.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.io.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>10.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.mapred.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>11.</td>
|
|
|
- <td>
|
|
|
- <code>import org.apache.hadoop.util.*;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>12.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>13.</td>
|
|
|
- <td>
|
|
|
- <code>public class WordCount extends Configured implements Tool {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>14.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>15.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- public static class Map extends MapReduceBase
|
|
|
- implements Mapper<LongWritable, Text, Text, IntWritable> {
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>16.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>17.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- static enum Counters { INPUT_WORDS }
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>18.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>19.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- private final static IntWritable one = new IntWritable(1);
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>20.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>private Text word = new Text();</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>21.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>22.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>private boolean caseSensitive = true;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>23.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>private Set<String> patternsToSkip = new HashSet<String>();</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>24.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>25.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>private long numRecords = 0;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>26.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>private String inputFile;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>27.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>28.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>public void configure(JobConf job) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>29.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- caseSensitive = job.getBoolean("wordcount.case.sensitive", true);
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>30.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>inputFile = job.get("map.input.file");</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>31.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>32.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>if (job.getBoolean("wordcount.skip.patterns", false)) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>33.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>Path[] patternsFiles = new Path[0];</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>34.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>try {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>35.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- patternsFiles = DistributedCache.getLocalCacheFiles(job);
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>36.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>} catch (IOException ioe) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>37.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- System.err.println("Caught exception while getting cached files: "
|
|
|
- + StringUtils.stringifyException(ioe));
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>38.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>39.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>for (Path patternsFile : patternsFiles) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>40.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>parseSkipFile(patternsFile);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>41.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>42.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>43.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>44.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>45.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>private void parseSkipFile(Path patternsFile) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>46.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>try {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>47.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- BufferedReader fis =
|
|
|
- new BufferedReader(new FileReader(patternsFile.toString()));
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>48.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>String pattern = null;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>49.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>while ((pattern = fis.readLine()) != null) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>50.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>patternsToSkip.add(pattern);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>51.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>52.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>} catch (IOException ioe) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>53.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- System.err.println("Caught exception while parsing the cached file '" +
|
|
|
- patternsFile + "' : " +
|
|
|
- StringUtils.stringifyException(ioe));
|
|
|
-
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>54.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>55.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>56.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>57.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- public void map(LongWritable key, Text value,
|
|
|
- OutputCollector<Text, IntWritable> output,
|
|
|
- Reporter reporter) throws IOException {
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>58.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- String line =
|
|
|
- (caseSensitive) ? value.toString() :
|
|
|
- value.toString().toLowerCase();
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>59.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>60.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>for (String pattern : patternsToSkip) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>61.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>line = line.replaceAll(pattern, "");</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>62.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>63.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>64.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>StringTokenizer tokenizer = new StringTokenizer(line);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>65.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>while (tokenizer.hasMoreTokens()) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>66.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>word.set(tokenizer.nextToken());</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>67.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>output.collect(word, one);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>68.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>reporter.incrCounter(Counters.INPUT_WORDS, 1);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>69.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>70.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>71.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>if ((++numRecords % 100) == 0) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>72.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- reporter.setStatus("Finished processing " + numRecords +
|
|
|
- " records " + "from the input file: " +
|
|
|
- inputFile);
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>73.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>74.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>75.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>76.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>77.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- public static class Reduce extends MapReduceBase implements
|
|
|
- Reducer<Text, IntWritable, Text, IntWritable> {
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>78.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- public void reduce(Text key, Iterator<IntWritable> values,
|
|
|
- OutputCollector<Text, IntWritable> output,
|
|
|
- Reporter reporter) throws IOException {
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>79.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>int sum = 0;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>80.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>while (values.hasNext()) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>81.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>sum += values.next().get();</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>82.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>83.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>output.collect(key, new IntWritable(sum));</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>84.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>85.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>86.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>87.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>public int run(String[] args) throws Exception {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>88.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- JobConf conf = new JobConf(getConf(), WordCount.class);
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>89.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setJobName("wordcount");</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>90.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>91.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setOutputKeyClass(Text.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>92.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setOutputValueClass(IntWritable.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>93.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>94.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setMapperClass(Map.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>95.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setCombinerClass(Reduce.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>96.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setReducerClass(Reduce.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>97.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>98.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setInputFormat(TextInputFormat.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>99.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>conf.setOutputFormat(TextOutputFormat.class);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>100.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>101.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- List<String> other_args = new ArrayList<String>();
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>102.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>for (int i=0; i < args.length; ++i) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>103.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>if ("-skip".equals(args[i])) {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>104.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>105.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- conf.setBoolean("wordcount.skip.patterns", true);
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>106.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>} else {</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>107.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>other_args.add(args[i]);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>108.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>109.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>110.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>111.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>112.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>113.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>114.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>JobClient.runJob(conf);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>115.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>return 0;</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>116.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>117.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>118.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- public static void main(String[] args) throws Exception {
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>119.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>
|
|
|
- int res = ToolRunner.run(new Configuration(), new WordCount(),
|
|
|
- args);
|
|
|
- </code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>120.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>System.exit(res);</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>121.</td>
|
|
|
- <td>
|
|
|
-
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>122.</td>
|
|
|
- <td>
|
|
|
- <code>}</code>
|
|
|
- </td>
|
|
|
- </tr>
|
|
|
- <tr>
|
|
|
- <td>123.</td>
|
|
|
- <td></td>
|
|
|
- </tr>
|
|
|
- </table>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Sample Runs</title>
|
|
|
-
|
|
|
- <p>Sample text-files as input:</p>
|
|
|
- <p>
|
|
|
- <code>$ bin/hadoop dfs -ls /usr/joe/wordcount/input/</code><br/>
|
|
|
- <code>/usr/joe/wordcount/input/file01</code><br/>
|
|
|
- <code>/usr/joe/wordcount/input/file02</code><br/>
|
|
|
- <br/>
|
|
|
- <code>$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file01</code><br/>
|
|
|
- <code>Hello World, Bye World!</code><br/>
|
|
|
- <br/>
|
|
|
- <code>$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file02</code><br/>
|
|
|
- <code>Hello Hadoop, Goodbye to hadoop.</code>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Run the application:</p>
|
|
|
- <p>
|
|
|
- <code>
|
|
|
- $ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount
|
|
|
- /usr/joe/wordcount/input /usr/joe/wordcount/output
|
|
|
- </code>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Output:</p>
|
|
|
- <p>
|
|
|
- <code>
|
|
|
- $ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000
|
|
|
- </code>
|
|
|
- <br/>
|
|
|
- <code>Bye 1</code><br/>
|
|
|
- <code>Goodbye 1</code><br/>
|
|
|
- <code>Hadoop, 1</code><br/>
|
|
|
- <code>Hello 2</code><br/>
|
|
|
- <code>World! 1</code><br/>
|
|
|
- <code>World, 1</code><br/>
|
|
|
- <code>hadoop. 1</code><br/>
|
|
|
- <code>to 1</code><br/>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Notice that the inputs differ from the first version we looked at,
|
|
|
- and how they affect the outputs.</p>
|
|
|
-
|
|
|
- <p>Now, let's plug in a pattern-file which lists the word-patterns to be
|
|
|
- ignored, via the <code>DistributedCache</code>.</p>
|
|
|
-
|
|
|
- <p>
|
|
|
- <code>$ hadoop dfs -cat /user/joe/wordcount/patterns.txt</code><br/>
|
|
|
- <code>\.</code><br/>
|
|
|
- <code>\,</code><br/>
|
|
|
- <code>\!</code><br/>
|
|
|
- <code>to</code><br/>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Run it again, this time with more options:</p>
|
|
|
- <p>
|
|
|
- <code>
|
|
|
- $ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount
|
|
|
- -Dwordcount.case.sensitive=true /usr/joe/wordcount/input
|
|
|
- /usr/joe/wordcount/output -skip /user/joe/wordcount/patterns.txt
|
|
|
- </code>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>As expected, the output:</p>
|
|
|
- <p>
|
|
|
- <code>
|
|
|
- $ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000
|
|
|
- </code>
|
|
|
- <br/>
|
|
|
- <code>Bye 1</code><br/>
|
|
|
- <code>Goodbye 1</code><br/>
|
|
|
- <code>Hadoop 1</code><br/>
|
|
|
- <code>Hello 2</code><br/>
|
|
|
- <code>World 2</code><br/>
|
|
|
- <code>hadoop 1</code><br/>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Run it once more, this time switching off case-sensitivity:</p>
|
|
|
- <p>
|
|
|
- <code>
|
|
|
- $ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount
|
|
|
- -Dwordcount.case.sensitive=false /usr/joe/wordcount/input
|
|
|
- /usr/joe/wordcount/output -skip /user/joe/wordcount/patterns.txt
|
|
|
- </code>
|
|
|
- </p>
|
|
|
-
|
|
|
- <p>Sure enough, the output:</p>
|
|
|
- <p>
|
|
|
- <code>
|
|
|
- $ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000
|
|
|
- </code>
|
|
|
- <br/>
|
|
|
- <code>bye 1</code><br/>
|
|
|
- <code>goodbye 1</code><br/>
|
|
|
- <code>hadoop 2</code><br/>
|
|
|
- <code>hello 2</code><br/>
|
|
|
- <code>world 2</code><br/>
|
|
|
- </p>
|
|
|
- </section>
|
|
|
-
|
|
|
- <section>
|
|
|
- <title>Highlights</title>
|
|
|
-
|
|
|
- <p>The second version of <code>WordCount</code> improves upon the
|
|
|
- previous one by using some features offered by the Map/Reduce framework:
|
|
|
- </p>
|
|
|
- <ul>
|
|
|
- <li>
|
|
|
- Demonstrates how applications can access configuration parameters
|
|
|
- in the <code>configure</code> method of the <code>Mapper</code> (and
|
|
|
- <code>Reducer</code>) implementations (lines 28-43).
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Demonstrates how the <code>DistributedCache</code> can be used to
|
|
|
- distribute read-only data needed by the jobs. Here it allows the user
|
|
|
- to specify word-patterns to skip while counting (line 104).
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Demonstrates the utility of the <code>Tool</code> interface and the
|
|
|
- <code>GenericOptionsParser</code> to handle generic Hadoop
|
|
|
- command-line options (lines 87-116, 119).
|
|
|
- </li>
|
|
|
- <li>
|
|
|
- Demonstrates how applications can use <code>Counters</code> (line 68)
|
|
|
- and how they can set application-specific status information via
|
|
|
- the <code>Reporter</code> instance passed to the <code>map</code> (and
|
|
|
- <code>reduce</code>) method (line 72).
|
|
|
- </li>
|
|
|
- </ul>
|
|
|
-
|
|
|
- </section>
|
|
|
- </section>
|
|
|
-
|
|
|
- <p>
|
|
|
- <em>Java and JNI are trademarks or registered trademarks of
|
|
|
- Sun Microsystems, Inc. in the United States and other countries.</em>
|
|
|
- </p>
|
|
|
-
|
|
|
- </body>
|
|
|
-
|
|
|
-</document>
|