
HADOOP-8427. Convert Forrest docs to APT, incremental. (adi2 via tucu)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1424459 13f79535-47bb-0310-9956-ffa450edef68
Alejandro Abdelnur 12 years ago
parent · commit · b324a6134e
22 changed files with 1894 additions and 3923 deletions
  1. +2 -0      hadoop-common-project/hadoop-common/CHANGES.txt
  2. +0 -127    hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/HttpAuthentication.xml
  3. +0 -1485   hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/cluster_setup.xml
  4. +0 -798    hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/commands_manual.xml
  5. +0 -594    hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/file_system_shell.xml
  6. +371 -383  hadoop-common-project/hadoop-common/src/site/apt/ClusterSetup.apt.vm
  7. +490 -0    hadoop-common-project/hadoop-common/src/site/apt/CommandsManual.apt.vm
  8. +418 -0    hadoop-common-project/hadoop-common/src/site/apt/FileSystemShell.apt.vm
  9. +99 -0     hadoop-common-project/hadoop-common/src/site/apt/HttpAuthentication.apt.vm
  10. +0 -536   hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/content/xdocs/hdfs_design.xml
  11. +512 -0   hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HdfsDesign.apt.vm
  12. +0 -0     hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfs-logo.jpg
  13. +0 -0     hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsarchitecture.gif
  14. +0 -0     hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsarchitecture.odg
  15. +0 -0     hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsarchitecture.png
  16. +0 -0     hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsdatanodes.gif
  17. +0 -0     hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsdatanodes.odg
  18. +0 -0     hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsdatanodes.png
  19. +0 -0     hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsproxy-forward.jpg
  20. +0 -0     hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsproxy-overview.jpg
  21. +0 -0     hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsproxy-server.jpg
  22. +2 -0     hadoop-project/src/site/site.xml

+ 2 - 0
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -415,6 +415,8 @@ Release 2.0.3-alpha - Unreleased
     HADOOP-9147. Add missing fields to FIleStatus.toString.
     (Jonathan Allen via suresh)
 
+    HADOOP-8427. Convert Forrest docs to APT, incremental. (adi2 via tucu)
+
   OPTIMIZATIONS
 
     HADOOP-8866. SampleQuantiles#query is O(N^2) instead of O(N). (Andrew Wang

+ 0 - 127
hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/HttpAuthentication.xml

@@ -1,127 +0,0 @@
-<?xml version="1.0"?>
-<!--
-  Copyright 2002-2004 The Apache Software Foundation
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
-          "http://forrest.apache.org/dtd/document-v20.dtd">
-
-
-<document>
-
-  <header>
-    <title> 
-      Authentication for Hadoop HTTP web-consoles
-    </title>
-  </header>
-
-  <body>
-    <section>
-      <title> Introduction </title>
-      <p>
-        This document describes how to configure Hadoop HTTP web-consoles to require user
-        authentication. 
-      </p>
-      <p>
-        By default Hadoop HTTP web-consoles (JobTracker, NameNode, TaskTrackers and DataNodes) allow 
-        access without any form of authentication. 
-      </p>
-      <p>
-        Similarly to Hadoop RPC, Hadoop HTTP web-consoles can be configured to require Kerberos 
-        authentication using HTTP SPNEGO protocol (supported by browsers like Firefox and Internet
-        Explorer).        
-      </p>
-      <p>
-        In addition, Hadoop HTTP web-consoles support the equivalent of Hadoop's Pseudo/Simple
-        authentication. If this option is enabled, users must specify their user name in the first
-        browser interaction using the <code>user.name</code> query string parameter. For example:
-        <code>http://localhost:50030/jobtracker.jsp?user.name=babu</code>.
-      </p>
-      <p>
-        If a custom authentication mechanism is required for the HTTP web-consoles, it is possible 
-        to implement a plugin to support the alternate authentication mechanism (refer to 
-        Hadoop hadoop-auth for details on writing an <code>AuthenticatorHandler</code>).
-      </p>
-      <p>       
-        The next section describes how to configure Hadoop HTTP web-consoles to require user 
-        authentication.
-      </p>
-    </section>
-
-    <section> 
-      <title> Configuration </title>
-
-      <p>
-        The following properties should be in the <code>core-site.xml</code> of all the nodes
-        in the cluster.
-      </p>
-
-      <p><code>hadoop.http.filter.initializers</code>: add to this property the 
-      <code>org.apache.hadoop.security.AuthenticationFilterInitializer</code> initializer class.
-      </p>
-      
-      <p><code>hadoop.http.authentication.type</code>: Defines authentication used for the HTTP 
-      web-consoles. The supported values are: <code>simple | kerberos | 
-      #AUTHENTICATION_HANDLER_CLASSNAME#</code>. The default value is <code>simple</code>.
-      </p>
-
-      <p><code>hadoop.http.authentication.token.validity</code>: Indicates how long (in seconds) 
-      an authentication token is valid before it has to be renewed. The default value is 
-      <code>36000</code>.
-      </p>
-
-      <p><code>hadoop.http.authentication.signature.secret.file</code>: The signature secret 
-      file for signing the authentication tokens. If not set, a random secret is generated at 
-      startup time. The same secret should be used for all nodes in the cluster (JobTracker, 
-      NameNode, DataNode and TaskTracker). The default value is 
-      <code>${user.home}/hadoop-http-auth-signature-secret</code>.
-      IMPORTANT: This file should be readable only by the Unix user running the daemons.
-      </p>
-        
-      <p><code>hadoop.http.authentication.cookie.domain</code>: The domain to use for the HTTP 
-      cookie that stores the authentication token. For authentication to work 
-      correctly across all nodes in the cluster, the domain must be set correctly.
-      There is no default value; in that case the HTTP cookie will not have a domain
-      and will work only with the hostname issuing the HTTP cookie.
-      </p>
-
-      <p>
-      IMPORTANT: when using IP addresses, browsers ignore cookies with domain settings.
-      For this setting to work properly, all nodes in the cluster must be configured
-      to generate URLs with hostname.domain names in them.
-      </p>
-
-      <p><code>hadoop.http.authentication.simple.anonymous.allowed</code>: Indicates if anonymous 
-      requests are allowed when using 'simple' authentication. The default value is 
-      <code>true</code>.
-      </p>
-
-      <p><code>hadoop.http.authentication.kerberos.principal</code>: Indicates the Kerberos 
-      principal to be used for HTTP endpoint when using 'kerberos' authentication.
-      The principal short name must be <code>HTTP</code> per Kerberos HTTP SPNEGO specification.
-      The default value is <code>HTTP/_HOST@$LOCALHOST</code>, where <code>_HOST</code>, if present,
-      is replaced with the bind address of the HTTP server.
-      </p>
-
-      <p><code>hadoop.http.authentication.kerberos.keytab</code>: Location of the keytab file 
-      with the credentials for the Kerberos principal used for the HTTP endpoint. 
-      The default value is <code>${user.home}/hadoop.keytab</code>.
-      </p>
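
For illustration, a minimal core-site.xml sketch that wires together the properties described above for Kerberos SPNEGO authentication; the principal, keytab path and secret-file path below are placeholders, not defaults:

  <property>
    <name>hadoop.http.filter.initializers</name>
    <value>org.apache.hadoop.security.AuthenticationFilterInitializer</value>
  </property>
  <property>
    <name>hadoop.http.authentication.type</name>
    <value>kerberos</value>
  </property>
  <property>
    <name>hadoop.http.authentication.token.validity</name>
    <value>36000</value>                                        <!-- seconds; the documented default -->
  </property>
  <property>
    <name>hadoop.http.authentication.kerberos.principal</name>
    <value>HTTP/namenode.example.com@EXAMPLE.COM</value>        <!-- placeholder principal -->
  </property>
  <property>
    <name>hadoop.http.authentication.kerberos.keytab</name>
    <value>/etc/security/keytabs/spnego.service.keytab</value>  <!-- placeholder path -->
  </property>
  <property>
    <name>hadoop.http.authentication.signature.secret.file</name>
    <value>/etc/hadoop/http-auth-signature-secret</value>       <!-- placeholder path; readable only by the daemon user -->
  </property>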
-
-    </section>
-
-  </body>
-</document>
-

+ 0 - 1485
hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/cluster_setup.xml

@@ -1,1485 +0,0 @@
-<?xml version="1.0"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
-
-<document>
-  
-  <header>
-    <title>Cluster Setup</title>
-  </header>
-  
-  <body>
-  
-    <section>
-      <title>Purpose</title>
-      
-      <p>This document describes how to install, configure and manage non-trivial
-      Hadoop clusters ranging from a few nodes to extremely large clusters with 
-      thousands of nodes.</p>
-      <p>
-      To play with Hadoop, you may first want to install Hadoop on a single machine (see <a href="single_node_setup.html"> Hadoop Quick Start</a>).
-      </p>
-    </section>
-    
-    <section>
-      <title>Pre-requisites</title>
-      
-      <ol>
-        <li>
-          Make sure all <a href="single_node_setup.html#PreReqs">requisite</a> software 
-          is installed on all nodes in your cluster.
-        </li>
-        <li>
-          <a href="single_node_setup.html#Download">Get</a> the Hadoop software.
-        </li>
-      </ol>
-    </section>
-    
-    <section>
-      <title>Installation</title>
-      
-      <p>Installing a Hadoop cluster typically involves unpacking the software 
-      on all the machines in the cluster.</p>
-      
-      <p>Typically one machine in the cluster is designated as the 
-      <code>NameNode</code> and another machine as the <code>JobTracker</code>,
-      exclusively. These are the <em>masters</em>. The rest of the machines in 
-      the cluster act as both <code>DataNode</code> <em>and</em> 
-      <code>TaskTracker</code>. These are the <em>slaves</em>.</p>
-      
-      <p>The root of the distribution is referred to as 
-      <code>HADOOP_PREFIX</code>. All machines in the cluster usually have the same 
-      <code>HADOOP_PREFIX</code> path.</p>
-    </section>
-    
-    <section>
-      <title>Configuration</title>
-      
-      <p>The following sections describe how to configure a Hadoop cluster.</p>
-      
-      <section>
-        <title>Configuration Files</title>
-        
-        <p>Hadoop configuration is driven by two types of important 
-        configuration files:</p>
-        <ol>
-          <li>
-            Read-only default configuration - 
-            <a href="ext:core-default">src/core/core-default.xml</a>, 
-            <a href="ext:hdfs-default">src/hdfs/hdfs-default.xml</a>, 
-            <a href="ext:mapred-default">src/mapred/mapred-default.xml</a> and
-            <a href="ext:mapred-queues">conf/mapred-queues.xml.template</a>.
-          </li>
-          <li>
-            Site-specific configuration - 
-            <a href="#core-site.xml">conf/core-site.xml</a>, 
-            <a href="#hdfs-site.xml">conf/hdfs-site.xml</a>, 
-            <a href="#mapred-site.xml">conf/mapred-site.xml</a> and
-            <a href="#mapred-queues.xml">conf/mapred-queues.xml</a>.
-          </li>
-        </ol>
-      
-        <p>To learn more about how the Hadoop framework is controlled by these 
-        configuration files, look 
-        <a href="ext:api/org/apache/hadoop/conf/configuration">here</a>.</p>
-      
-        <p>Additionally, you can control the Hadoop scripts found in the 
-        <code>bin/</code> directory of the distribution by setting site-specific 
-        values via <code>conf/hadoop-env.sh</code>.</p>
-      </section>
-      
-      <section>
-        <title>Site Configuration</title>
-        
-        <p>To configure the Hadoop cluster you will need to configure the
-        <em>environment</em> in which the Hadoop daemons execute as well as
-        the <em>configuration parameters</em> for the Hadoop daemons.</p>
-        
-        <p>The Hadoop daemons are <code>NameNode</code>/<code>DataNode</code> 
-        and <code>JobTracker</code>/<code>TaskTracker</code>.</p>
-        
-        <section>
-          <title>Configuring the Environment of the Hadoop Daemons</title>
-
-          <p>Administrators should use the <code>conf/hadoop-env.sh</code> script
-          to do site-specific customization of the Hadoop daemons' process 
-          environment.</p> 
-          
-          <p>At the very least you should specify the
-          <code>JAVA_HOME</code> so that it is correctly defined on each
-          remote node.</p>
-          
-          <p>Administrators can configure individual daemons using the
-          configuration options <code>HADOOP_*_OPTS</code>. Various options 
-          available are shown below in the table. </p>
-          <table>
-          <tr><th>Daemon</th><th>Configure Options</th></tr>
-          <tr><td>NameNode</td><td>HADOOP_NAMENODE_OPTS</td></tr>
-          <tr><td>DataNode</td><td>HADOOP_DATANODE_OPTS</td></tr>
-          <tr><td>SecondaryNamenode</td>
-              <td>HADOOP_SECONDARYNAMENODE_OPTS</td></tr>
-          </table>
-          
-          <p> For example, to configure the NameNode to use parallelGC, the
-          following statement should be added in <code>hadoop-env.sh</code>:
-          <br/><code>
-          export HADOOP_NAMENODE_OPTS="-XX:+UseParallelGC ${HADOOP_NAMENODE_OPTS}"
-          </code><br/></p>
-          
-          <p>Other useful configuration parameters that you can customize 
-          include:</p>
-          <ul>
-            <li>
-              <code>HADOOP_LOG_DIR</code> - The directory where the daemons'
-              log files are stored. They are automatically created if they don't
-              exist.
-            </li>
-            <li>
-              <code>HADOOP_HEAPSIZE</code> - The maximum heap size to use,
-              in MB, e.g. <code>1000</code>. This is used to 
-              configure the heap size for the Hadoop daemons. By default,
-              the value is <code>1000</code> (1000 MB).
-            </li>
-          </ul>
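
Tying the above together, a hypothetical conf/hadoop-env.sh fragment; the paths and heap size are illustrative values, not defaults:

  # JDK location, required on every node.
  export JAVA_HOME=/usr/java/default                                  # illustrative path
  # Extra JVM options for the NameNode only, e.g. the parallel collector.
  export HADOOP_NAMENODE_OPTS="-XX:+UseParallelGC ${HADOOP_NAMENODE_OPTS}"
  # Directory where daemon log files are written.
  export HADOOP_LOG_DIR=/var/log/hadoop                               # illustrative path
  # Daemon heap size, in MB.
  export HADOOP_HEAPSIZE=2000                                         # illustrative value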
-        </section>
-        
-        <section>
-          <title>Configuring the Hadoop Daemons</title>
-          
-          <p>This section deals with important parameters to be specified in the
-          following:</p>
-          <anchor id="core-site.xml"/><p><code>conf/core-site.xml</code>:</p>
-
-		  <table>
-  		    <tr>
-		      <th>Parameter</th>
-		      <th>Value</th> 
-		      <th>Notes</th>
-		    </tr>
-		    <tr>
-              <td>fs.defaultFS</td>
-              <td>URI of <code>NameNode</code>.</td>
-              <td><em>hdfs://hostname/</em></td>
-            </tr>
-          </table>
-
-      <anchor id="hdfs-site.xml"/><p><code>conf/hdfs-site.xml</code>:</p>
-          
-      <table>   
-        <tr>
-          <th>Parameter</th>
-          <th>Value</th> 
-          <th>Notes</th>
-        </tr>
-		    <tr>
-		      <td>dfs.namenode.name.dir</td>
-		      <td>
-		        Path on the local filesystem where the <code>NameNode</code> 
-		        stores the namespace and transactions logs persistently.</td>
-		      <td>
-		        If this is a comma-delimited list of directories then the name 
-		        table is replicated in all of the directories, for redundancy.
-		      </td>
-		    </tr>
-		    <tr>
-		      <td>dfs.datanode.data.dir</td>
-		      <td>
-		        Comma separated list of paths on the local filesystem of a 
-		        <code>DataNode</code> where it should store its blocks.
-		      </td>
-		      <td>
-		        If this is a comma-delimited list of directories, then data will 
-		        be stored in all named directories, typically on different 
-		        devices.
-		      </td>
-		    </tr>
-      </table>
-
-      <anchor id="mapred-site.xml"/><p><code>conf/mapred-site.xml</code>:</p>
-
-      <table>
-          <tr>
-          <th>Parameter</th>
-          <th>Value</th> 
-          <th>Notes</th>
-        </tr>
-        <tr>
-          <td>mapreduce.jobtracker.address</td>
-          <td>Host or IP and port of <code>JobTracker</code>.</td>
-          <td><em>host:port</em> pair.</td>
-        </tr>
-		    <tr>
-		      <td>mapreduce.jobtracker.system.dir</td>
-		      <td>
-		        Path on the HDFS where the Map/Reduce framework stores 
-		        system files e.g. <code>/hadoop/mapred/system/</code>.
-		      </td>
-		      <td>
-		        This is in the default filesystem (HDFS) and must be accessible 
-		        from both the server and client machines.
-		      </td>
-		    </tr>
-		    <tr>
-		      <td>mapreduce.cluster.local.dir</td>
-		      <td>
-		        Comma-separated list of paths on the local filesystem where 
-		        temporary Map/Reduce data is written.
-		      </td>
-		      <td>Multiple paths help spread disk i/o.</td>
-		    </tr>
-		    <tr>
-		      <td>mapred.tasktracker.{map|reduce}.tasks.maximum</td>
-		      <td>
-		        The maximum number of Map/Reduce tasks, which are run 
-		        simultaneously on a given <code>TaskTracker</code>, individually.
-		      </td>
-		      <td>
-		        Defaults to 2 (2 maps and 2 reduces), but vary it depending on 
-		        your hardware.
-		      </td>
-		    </tr>
-		    <tr>
-		      <td>dfs.hosts/dfs.hosts.exclude</td>
-		      <td>List of permitted/excluded DataNodes.</td>
-		      <td>
-		        If necessary, use these files to control the list of allowable 
-		        datanodes.
-		      </td>
-		    </tr>
-		    <tr>
-		      <td>mapreduce.jobtracker.hosts.filename/mapreduce.jobtracker.hosts.exclude.filename</td>
-		      <td>List of permitted/excluded TaskTrackers.</td>
-		      <td>
-		        If necessary, use these files to control the list of allowable 
-		        TaskTrackers.
-		      </td>
-      </tr>
-      <tr>
-        <td>mapreduce.cluster.acls.enabled</td>
-        <td>Boolean, specifying whether checks for queue ACLs and job ACLs
-            are to be done for authorizing users for doing queue operations and
-            job operations.
-        </td>
-        <td>
-          If <em>true</em>, queue ACLs are checked while submitting
-          and administering jobs and job ACLs are checked for authorizing
-          view and modification of jobs. Queue ACLs are specified using the
-          configuration parameters of the form defined below under
-          mapred-queues.xml. Job ACLs are described at
-          mapred-tutorial in "Job Authorization" section.
-          To enable this flag (mapreduce.cluster.acls.enabled), it must be
-          set to true in mapred-site.xml on the JobTracker node and on all
-          TaskTracker nodes.
-        </td>
-      </tr>
-  		    
-		  </table>      
-
-          <p>Typically all the above parameters are marked as 
-          <a href="ext:api/org/apache/hadoop/conf/configuration/final_parameters">
-          final</a> to ensure that they cannot be overridden by user-applications.
-          </p>
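
As a sketch of that pattern, a conf/core-site.xml entry with an illustrative NameNode host, marked final so user applications cannot override it:

  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://namenode.example.com/</value>   <!-- illustrative hostname -->
    <final>true</final>
  </property>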
-
-          <anchor id="mapred-queues.xml"/><p><code>conf/mapred-queues.xml
-          </code>:</p>
-          <p>This file is used to configure the queues in the Map/Reduce
-          system. Queues are abstract entities in the JobTracker that can be
-          used to manage collections of jobs. They provide a way for 
-          administrators to organize jobs in specific ways and to enforce 
-          certain policies on such collections, thus providing varying
-          levels of administrative control and management functions on jobs.
-          </p> 
-          <p>One can imagine the following sample scenarios:</p>
-          <ul>
-            <li> Jobs submitted by a particular group of users can all be 
-            submitted to one queue. </li> 
-            <li> Long running jobs in an organization can be submitted to a
-            queue. </li>
-            <li> Short running jobs can be submitted to a queue and the number
-            of jobs that can run concurrently can be restricted. </li> 
-          </ul> 
-          <p>The usage of queues is closely tied to the scheduler configured
-          at the JobTracker via <em>mapreduce.jobtracker.taskscheduler</em>.
-          The degree of support of queues depends on the scheduler used. Some
-          schedulers support a single queue, while others support more complex
-          configurations. Schedulers also implement the policies that apply 
-          to jobs in a queue. Some schedulers, such as the Fairshare scheduler,
-          implement their own mechanisms for collections of jobs and do not rely
-          on queues provided by the framework. The administrators are 
-          encouraged to refer to the documentation of the scheduler they are
-          interested in for determining the level of support for queues.</p>
-          <p>The Map/Reduce framework supports some basic operations on queues
-          such as job submission to a specific queue, access control for queues,
-          queue states, viewing configured queues and their properties
-          and refresh of queue properties. In order to fully implement some of
-          these operations, the framework takes the help of the configured
-          scheduler.</p>
-          <p>The following types of queue configurations are possible:</p>
-          <ul>
-            <li> Single queue: The default configuration in Map/Reduce comprises
-            a single queue, as supported by the default scheduler. All jobs
-            are submitted to this default queue which maintains jobs in a priority
-            based FIFO order.</li>
-            <li> Multiple single level queues: Multiple queues are defined, and
-            jobs can be submitted to any of these queues. Different policies
-            can be applied to these queues by schedulers that support this 
-            configuration to provide a better level of support. For example,
-            the <a href="ext:capacity-scheduler">capacity scheduler</a>
-            provides ways of configuring different 
-            capacity and fairness guarantees on these queues.</li>
-            <li> Hierarchical queues: Hierarchical queues are a configuration in
-            which queues can contain other queues within them recursively. The
-            queues that contain other queues are referred to as 
-            container queues. Queues that do not contain other queues are 
-            referred to as leaf or job queues. Jobs can only be submitted to leaf
-            queues. Hierarchical queues can potentially offer a higher level 
-            of control to administrators, as schedulers can now build a
-            hierarchy of policies where policies applicable to a container
-            queue can provide context for policies applicable to queues it
-            contains. It also opens up possibilities for delegating queue
-            administration where administration of queues in a container queue
-            can be turned over to a different set of administrators, within
-            the context provided by the container queue. For example, the
-            <a href="ext:capacity-scheduler">capacity scheduler</a>
-            uses hierarchical queues to partition the capacity of a cluster
-            among container queues, allowing the queues they contain to divide
-            that capacity in finer-grained ways.</li> 
-          </ul>
-
-          <p>Most of the configuration of the queues can be refreshed/reloaded
-          without restarting the Map/Reduce sub-system by editing this
-          configuration file as described in the section on
-          <a href="commands_manual.html#RefreshQueues">reloading queue 
-          configuration</a>.
-          Not all configuration properties can be reloaded, of course,
-          as the description of each property below explains.</p>
-
-          <p>The format of conf/mapred-queues.xml is different from the other 
-          configuration files, using nested configuration
-          elements to support hierarchical queues. The format is as follows:
-          </p>
-
-          <source>
-          &lt;queues&gt;
-            &lt;queue&gt;
-              &lt;name&gt;$queue-name&lt;/name&gt;
-              &lt;state&gt;$state&lt;/state&gt;
-              &lt;queue&gt;
-                &lt;name&gt;$child-queue1&lt;/name&gt;
-                &lt;properties&gt;
-                   &lt;property key="$key" value="$value"/&gt;
-                   ...
-                &lt;/properties&gt;
-                &lt;queue&gt;
-                  &lt;name&gt;$grand-child-queue1&lt;/name&gt;
-                  ...
-                &lt;/queue&gt;
-              &lt;/queue&gt;
-              &lt;queue&gt;
-                &lt;name&gt;$child-queue2&lt;/name&gt;
-                ...
-              &lt;/queue&gt;
-              ...
-              ...
-              ...
-              &lt;queue&gt;
-                &lt;name&gt;$leaf-queue&lt;/name&gt;
-                &lt;acl-submit-job&gt;$acls&lt;/acl-submit-job&gt;
-                &lt;acl-administer-jobs&gt;$acls&lt;/acl-administer-jobs&gt;
-                &lt;properties&gt;
-                   &lt;property key="$key" value="$value"/&gt;
-                   ...
-                &lt;/properties&gt;
-              &lt;/queue&gt;
-            &lt;/queue&gt;
-          &lt;/queues&gt;
-          </source>
-          <table>
-            <tr>
-              <th>Tag/Attribute</th>
-              <th>Value</th>
-              <th>
-              	<a href="commands_manual.html#RefreshQueues">Refresh-able?</a>
-              </th>
-              <th>Notes</th>
-            </tr>
-
-            <tr>
-              <td><anchor id="queues_tag"/>queues</td>
-              <td>Root element of the configuration file.</td>
-              <td>Not-applicable</td>
-              <td>All the queues are nested inside this root element of the
-              file. There can be only one root queues element in the file.</td>
-            </tr>
-
-            <tr>
-              <td>aclsEnabled</td>
-              <td>Boolean attribute to the
-              <a href="#queues_tag"><em>&lt;queues&gt;</em></a> tag
-              specifying whether ACLs are supported for controlling job
-              submission and administration for <em>all</em> the queues
-              configured.
-              </td>
-              <td>Yes</td>
-              <td>If <em>false</em>, ACLs are ignored for <em>all</em> the
-              configured queues. <br/><br/>
-              If <em>true</em>, the user and group details of the user
-              are checked against the configured ACLs of the corresponding
-              job-queue while submitting and administering jobs. ACLs can be
-              specified for each queue using the queue-specific tags
-              "acl-$acl_name", defined below. ACLs are checked only against
-              the job-queues, i.e. the leaf-level queues; ACLs configured
-              for the rest of the queues in the hierarchy are ignored.
-              </td>
-            </tr>
-
-            <tr>
-              <td><anchor id="queue_tag"/>queue</td>
-              <td>A child element of the
-              <a href="#queues_tag"><em>&lt;queues&gt;</em></a> tag or another
-              <a href="#queue_tag"><em>&lt;queue&gt;</em></a>. Denotes a queue
-              in the system.
-              </td>
-              <td>Not applicable</td>
-              <td>Queues can be hierarchical and so this element can contain
-              children of this same type.</td>
-            </tr>
-
-            <tr>
-              <td>name</td>
-              <td>Child element of a 
-              <a href="#queue_tag"><em>&lt;queue&gt;</em></a> specifying the
-              name of the queue.</td>
-              <td>No</td>
-              <td>Name of the queue cannot contain the character <em>":"</em>
-              which is reserved as the queue-name delimiter when addressing a
-              queue in a hierarchy.</td>
-            </tr>
-
-            <tr>
-              <td>state</td>
-              <td>Child element of a
-              <a href="#queue_tag"><em>&lt;queue&gt;</em></a> specifying the
-              state of the queue.
-              </td>
-              <td>Yes</td>
-              <td>Each queue has a corresponding state. A queue in
-              <em>'running'</em> state can accept new jobs, while a queue in
-              <em>'stopped'</em> state will stop accepting any new jobs. State
-              is defined and respected by the framework only for the
-              leaf-level queues and is ignored for all other queues.
-              <br/><br/>
-              The state of the queue can be viewed from the command line using
-              <code>'bin/mapred queue'</code> command and also on the Web
-              UI.<br/><br/>
-              Administrators can stop and start queues at runtime using the
-              feature of <a href="commands_manual.html#RefreshQueues">reloading
-              queue configuration</a>. If a queue is stopped at runtime, it
-              will complete all the existing running jobs and will stop
-              accepting any new jobs.
-              </td>
-            </tr>
-
-            <tr>
-              <td>acl-submit-job</td>
-              <td>Child element of a
-              <a href="#queue_tag"><em>&lt;queue&gt;</em></a> specifying the
-              list of users and groups that can submit jobs to the specified
-              queue.</td>
-              <td>Yes</td>
-              <td>
-              Applicable only to leaf-queues.<br/><br/>
-              The lists of users and groups are both comma-separated
-              lists of names. The two lists are separated by a blank.
-              Example: <em>user1,user2 group1,group2</em>.
-              If you wish to define only a list of groups, provide
-              a blank at the beginning of the value.
-              <br/><br/>
-              </td>
-            </tr>
-
-            <tr>
-              <td>acl-administer-jobs</td>
-              <td>Child element of a
-              <a href="#queue_tag"><em>&lt;queue&gt;</em></a> specifying the
-              list of users and groups that can view job details, change the
-              priority of a job or kill a job that has been submitted to the
-              specified queue.
-              </td>
-              <td>Yes</td>
-              <td>
-              Applicable only to leaf-queues.<br/><br/>
-              The lists of users and groups are both comma-separated
-              lists of names. The two lists are separated by a blank.
-              Example: <em>user1,user2 group1,group2</em>.
-              If you wish to define only a list of groups, provide
-              a blank at the beginning of the value. Note that the
-              owner of a job can always change the priority or kill
-              his/her own job, irrespective of the ACLs.
-              </td>
-            </tr>
-
-            <tr>
-              <td><anchor id="properties_tag"/>properties</td>
-              <td>Child element of a 
-              <a href="#queue_tag"><em>&lt;queue&gt;</em></a> specifying the
-              scheduler specific properties.</td>
-              <td>Not applicable</td>
-              <td>The scheduler specific properties are the children of this
-              element specified as a group of &lt;property&gt; tags described
-              below. The JobTracker completely ignores these properties. These
-              can be used as per-queue properties needed by the scheduler
-              being configured. Please look at the scheduler specific
-              documentation as to how these properties are used by that
-              particular scheduler.
-              </td>
-            </tr>
-
-            <tr>
-              <td><anchor id="property_tag"/>property</td>
-              <td>Child element of
-              <a href="#properties_tag"><em>&lt;properties&gt;</em></a> for a
-              specific queue.</td>
-              <td>Not applicable</td>
-              <td>A single scheduler specific queue-property. Ignored by
-              the JobTracker and used by the scheduler that is configured.</td>
-            </tr>
-
-            <tr>
-              <td>key</td>
-              <td>Attribute of a
-              <a href="#property_tag"><em>&lt;property&gt;</em></a> for a
-              specific queue.</td>
-              <td>Scheduler-specific</td>
-              <td>The name of a single scheduler specific queue-property.</td>
-            </tr>
-
-            <tr>
-              <td>value</td>
-              <td>Attribute of a
-              <a href="#property_tag"><em>&lt;property&gt;</em></a> for a
-              specific queue.</td>
-              <td>Scheduler-specific</td>
-              <td>The value of a single scheduler specific queue-property.
-              The value can be anything that is left for the proper
-              interpretation by the scheduler that is configured.</td>
-            </tr>
-
-         </table>
-
-          <p>Once the queues are configured properly and the Map/Reduce
-          system is up and running, from the command line one can
-          <a href="commands_manual.html#QueuesList">get the list
-          of queues</a> and
-          <a href="commands_manual.html#QueuesInfo">obtain
-          information specific to each queue</a>. This information is also
-          available from the web UI. On the web UI, queue information can be
-          seen by going to queueinfo.jsp, linked to from the queues table-cell
-          in the cluster-summary table. The queueinfo.jsp prints the hierarchy
-          of queues as well as the specific information for each queue.
-          </p>
-
-          <p> Users can submit jobs only to a
-          leaf-level queue by specifying the fully-qualified queue-name for
-          the property name <em>mapreduce.job.queuename</em> in the job
-          configuration. The character ':' is the queue-name delimiter. So,
-          for example, if one wants to submit to a configured job-queue 'Queue-C',
-          which is one of the sub-queues of 'Queue-B', which in turn is a
-          sub-queue of 'Queue-A', then the job configuration should contain the
-          property <em>mapreduce.job.queuename</em> set to the value
-          <em>Queue-A:Queue-B:Queue-C</em>.</p>
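
Concretely, the job configuration entry for that hypothetical hierarchy would look like the sketch below (queue names as in the example above):

  <property>
    <name>mapreduce.job.queuename</name>
    <value>Queue-A:Queue-B:Queue-C</value>
  </property>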
-         </section>
-          <section>
-            <title>Real-World Cluster Configurations</title>
-            
-            <p>This section lists some non-default configuration parameters which 
-            have been used to run the <em>sort</em> benchmark on very large 
-            clusters.</p>
-            
-            <ul>
-              <li>
-                <p>Some non-default configuration values used to run sort900,
-                that is 9TB of data sorted on a cluster with 900 nodes:</p>
-                <table>
-  		          <tr>
-                <th>Configuration File</th>
-		            <th>Parameter</th>
-		            <th>Value</th> 
-		            <th>Notes</th>
-		          </tr>
-                  <tr>
-                    <td>conf/hdfs-site.xml</td>
-                    <td>dfs.blocksize</td>
-                    <td>128m</td>
-                    <td>
-                        HDFS blocksize of 128 MB for large file-systems. Sizes can be provided
-                        in size-prefixed values (10k, 128m, 1g, etc.) or simply in bytes (134217728 for 128 MB, etc.).
-                    </td>
-                  </tr>
-                  <tr>
-                    <td>conf/hdfs-site.xml</td>
-                    <td>dfs.namenode.handler.count</td>
-                    <td>40</td>
-                    <td>
-                      More NameNode server threads to handle RPCs from large 
-                      number of DataNodes.
-                    </td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.reduce.shuffle.parallelcopies</td>
-                    <td>20</td>
-                    <td>
-                      Higher number of parallel copies run by reduces to fetch
-                      outputs from very large number of maps.
-                    </td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.map.java.opts</td>
-                    <td>-Xmx512M</td>
-                    <td>
-                      Larger heap-size for child jvms of maps. 
-                    </td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.reduce.java.opts</td>
-                    <td>-Xmx512M</td>
-                    <td>
-                      Larger heap-size for child jvms of reduces. 
-                    </td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.reduce.shuffle.input.buffer.percent</td>
-                    <td>0.80</td>
-                    <td>
-                      Larger amount of memory allocated for merging map output
-                      in memory during the shuffle. Expressed as a fraction of
-                      the total heap.
-                    </td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.reduce.input.buffer.percent</td>
-                    <td>0.80</td>
-                    <td>
-                      Larger amount of memory allocated for retaining map output
-                      in memory during the reduce. Expressed as a fraction of
-                      the total heap.
-                    </td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.task.io.sort.factor</td>
-                    <td>100</td>
-                    <td>More streams merged at once while sorting files.</td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.task.io.sort.mb</td>
-                    <td>200</td>
-                    <td>Higher memory-limit while sorting data.</td>
-                  </tr>
-                  <tr>
-                    <td>conf/core-site.xml</td>
-                    <td>io.file.buffer.size</td>
-                    <td>131072</td>
-                    <td>Size of read/write buffer used in SequenceFiles.</td>
-                  </tr>
-                </table>
-              </li>
-              <li>
-                <p>Updates to some configuration values to run sort1400 and 
-                sort2000, that is 14TB of data sorted on 1400 nodes and 20TB of
-                data sorted on 2000 nodes:</p>
-                <table>
-  		          <tr>
-                <th>Configuration File</th>
-		            <th>Parameter</th>
-		            <th>Value</th> 
-		            <th>Notes</th>
-		          </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.jobtracker.handler.count</td>
-                    <td>60</td>
-                    <td>
-                      More JobTracker server threads to handle RPCs from large 
-                      number of TaskTrackers.
-                    </td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.reduce.shuffle.parallelcopies</td>
-                    <td>50</td>
-                    <td></td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.tasktracker.http.threads</td>
-                    <td>50</td>
-                    <td>
-                      More worker threads for the TaskTracker's http server. The
-                      http server is used by reduces to fetch intermediate 
-                      map-outputs.
-                    </td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.map.java.opts</td>
-                    <td>-Xmx512M</td>
-                    <td>
-                      Larger heap-size for child jvms of maps. 
-                    </td>
-                  </tr>
-                  <tr>
-                    <td>conf/mapred-site.xml</td>
-                    <td>mapreduce.reduce.java.opts</td>
-                    <td>-Xmx1024M</td>
-                    <td>Larger heap-size for child jvms of reduces.</td>
-                  </tr>
-                </table>
-              </li>
-            </ul>
-          </section>
-        <section>
-        <title>Configuring Memory Parameters for MapReduce Jobs</title>
-        <p>
-        As MapReduce jobs could use varying amounts of memory, Hadoop
-        provides various configuration options to users and administrators
-        for managing memory effectively. Some of these options are job 
-        specific and can be used by users. While setting up a cluster, 
-        administrators can configure appropriate default values for these 
-        options so that users' jobs run out of the box. Other options are 
-        cluster specific and can be used by administrators to enforce 
-        limits and prevent misconfigured or memory intensive jobs from 
-        causing undesired side effects on the cluster.
-        </p>
-        <p> 
-        The values configured should
-        take into account the hardware resources of the cluster, such as the
-        amount of physical and virtual memory available for tasks,
-        the number of slots configured on the slaves and the requirements
-        for other processes running on the slaves. If the right values are not
-        set, it is likely that jobs will start failing with memory-related
-        errors or, in the worst case, even affect other tasks or
-        the slaves themselves.
-        </p>
-
-        <section>
-          <title>Monitoring Task Memory Usage</title>
-          <p>
-          Before describing the memory options, it is
-          useful to look at a feature provided by Hadoop to monitor
-          memory usage of MapReduce tasks it runs. The basic objective
-          of this feature is to prevent MapReduce tasks from consuming
-          memory beyond a limit that would result in their affecting
-          other processes running on the slave, including other tasks
-          and daemons like the DataNode or TaskTracker.
-          </p>
-        
-          <p>
-          <em>Note:</em> For the time being, this feature is available
-          only for the Linux platform.
-          </p>
-          
-          <p>
-          Hadoop allows monitoring to be done both for virtual
-          and physical memory usage of tasks. This monitoring 
-          can be done independently of each other, and therefore the
-          options can be configured independently of each other. It
-          has been found in some environments, particularly related
-          to streaming, that virtual memory recorded for tasks is high
-          because of libraries loaded by the programs used to run
-          the tasks. However, this memory is largely unused and does
-          not affect the slave's memory itself. In such cases,
-          monitoring based on physical memory can provide a more
-          accurate picture of memory usage.
-          </p>
-          
-          <p>
-          This feature considers that there is a limit on
-          the amount of virtual or physical memory on the slaves 
-          that can be used by
-          the running MapReduce tasks. The rest of the memory is
-          assumed to be required for the system and other processes.
-          Since some jobs may require a higher amount of memory for their
-          tasks than others, Hadoop allows jobs to specify the maximum amount
-          of memory they expect to use. Then, by using
-          resource-aware scheduling and monitoring, Hadoop tries to
-          ensure that at any time only as many tasks are running on
-          the slaves as can meet the dual constraints of an individual
-          job's memory requirements and the total amount of memory
-          available for all MapReduce tasks.
-          </p>
-          
-          <p>
-          The TaskTracker monitors tasks at regular intervals. Each time,
-          it operates in two steps:
-          </p> 
-          
-          <ul>
-          
-          <li>
-          In the first step, it
-          checks that a job's task and any child processes it
-          launches are not cumulatively using more virtual or physical 
-          memory than specified. If both virtual and physical memory
-          monitoring is enabled, then virtual memory usage is checked
-          first, followed by physical memory usage. 
-          Any task that is found to
-          use more memory is killed along with any child processes it
-          might have launched, and the task status is marked
-          <em>failed</em>. Repeated failures such as this will terminate
-          the job. 
-          </li>
-          
-          <li>
-          In the next step, it checks that the cumulative virtual and
-          physical memory 
-          used by all running tasks and their child processes
-          does not exceed the total virtual and physical memory limit,
-          respectively. Again, virtual memory limit is checked first, 
-          followed by physical memory limit. In this case, it kills
-          enough tasks, along with any child processes they
-          might have launched, until the cumulative memory usage
-          is brought under the limit. In the case of virtual memory limit
-          being exceeded, the tasks chosen for killing are
-          the ones that have made the least progress. In the case of
-          physical memory limit being exceeded, the tasks chosen
-          for killing are the ones that have used the maximum amount
-          of physical memory. Also, the status
-          of these tasks is marked as <em>killed</em>, and hence repeated
-          occurrence of this will not result in a job failure. 
-          </li>
-          
-          </ul>
-          
-          <p>
-          In either case, the task's diagnostic message will indicate the
-          reason why the task was terminated.
-          </p>
-          
-          <p>
-          Resource aware scheduling can ensure that tasks are scheduled
-          on a slave only if their memory requirement can be satisfied
-          by the slave. The Capacity Scheduler, for example,
-          takes virtual memory requirements into account while 
-          scheduling tasks, as described in the section on 
-          <a href="ext:capacity-scheduler/MemoryBasedTaskScheduling"> 
-          memory based scheduling</a>.
-          </p>
- 
-          <p>
-          Memory monitoring is enabled when certain configuration 
-          variables are defined with non-zero values, as described below.
-          </p>
-          
-        </section>
-
-        <section>
-          <title>Job Specific Options</title>
-          <p>
-          Memory related options that can be configured individually per 
-          job are described in detail in the section on
-          <a href="ext:mapred-tutorial/ConfiguringMemoryRequirements">
-          Configuring Memory Requirements For A Job</a> in the MapReduce
-          tutorial. While setting up
-          the cluster, the Hadoop defaults for these options can be reviewed
-          and changed to better suit the job profiles expected to be run on
-          the clusters, as well as the hardware configuration.
-          </p>
-          <p>
-          As with any other configuration option in Hadoop, if the 
-          administrators desire to prevent users from overriding these 
-          options in jobs they submit, these values can be marked as
-          <em>final</em> in the cluster configuration.
-          </p>
-        </section>
-        
-          
-        <section>
-          <title>Cluster Specific Options</title>
-          
-          <p>
-          This section describes the memory related options that are
-          used by the JobTracker and TaskTrackers, and cannot be changed 
-          by jobs. The values set for these options should be the same
-          for all the slave nodes in a cluster.
-          </p>
-          
-          <ul>
-          
-          <li>
-          <code>mapreduce.cluster.{map|reduce}memory.mb</code>: These
-          options define the default amount of virtual memory that should be
-          allocated for MapReduce tasks running in the cluster. They
-          typically match the default values set for the options
-          <code>mapreduce.{map|reduce}.memory.mb</code>. They help in the
-          calculation of the total amount of virtual memory available for 
-          MapReduce tasks on a slave, using the following equation:<br/>
-          <em>Total virtual memory for all MapReduce tasks = 
-          (mapreduce.cluster.mapmemory.mb * 
-           mapreduce.tasktracker.map.tasks.maximum) +
-          (mapreduce.cluster.reducememory.mb * 
-           mapreduce.tasktracker.reduce.tasks.maximum)</em><br/>
-          Typically, reduce tasks require more memory than map tasks.
-          Hence a higher value is recommended for 
-          <em>mapreduce.cluster.reducememory.mb</em>. The value is  
-          specified in MB. To set a value of 2GB for reduce tasks, set
-          <em>mapreduce.cluster.reducememory.mb</em> to 2048.
-          </li>
-
-          <li>
-          <code>mapreduce.jobtracker.max{map|reduce}memory.mb</code>:
-          These options define the maximum amount of virtual memory that 
-          can be requested by jobs using the parameters
-          <code>mapreduce.{map|reduce}.memory.mb</code>. The system
-          will reject any job that is submitted requesting for more
-          memory than these limits. Typically, the values for these
-          options should be set to satisfy the following constraint:<br/>
-          <em>mapreduce.jobtracker.maxmapmemory.mb =
-            mapreduce.cluster.mapmemory.mb * 
-             mapreduce.tasktracker.map.tasks.maximum<br/>
-              mapreduce.jobtracker.maxreducememory.mb =
-            mapreduce.cluster.reducememory.mb * 
-             mapreduce.tasktracker.reduce.tasks.maximum</em><br/>
-          The value is specified in MB. If 
-          <code>mapreduce.cluster.reducememory.mb</code> is set to 2GB and
-          there are 2 reduce slots configured in the slaves, the value
-          for <code>mapreduce.jobtracker.maxreducememory.mb</code> should 
-          be set to 4096.
-          </li>
-          
-          <li>
-          <code>mapreduce.tasktracker.reserved.physicalmemory.mb</code>:
-          This option defines the amount of physical memory that is
-          marked for system and daemon processes. Using this, the amount
-          of physical memory available for MapReduce tasks is calculated
-          using the following equation:<br/>
-          <em>Total physical memory for all MapReduce tasks = 
-                Total physical memory available on the system -
-                mapreduce.tasktracker.reserved.physicalmemory.mb</em><br/>
-          The value is specified in MB. To set this value to 2GB, 
-          specify the value as 2048.
-          </li>
-
-          <li>
-          <code>mapreduce.tasktracker.taskmemorymanager.monitoringinterval</code>:
-          This option defines the time the TaskTracker waits between
-          two cycles of memory monitoring. The value is specified in 
-          milliseconds.
-          </li>
-          
-          </ul>
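
To make the arithmetic above concrete, a hypothetical mapred-site.xml fragment for slaves with 2 map slots and 2 reduce slots; every value is illustrative:

  <!-- default per-task virtual memory: 1 GB per map task, 2 GB per reduce task -->
  <property><name>mapreduce.cluster.mapmemory.mb</name><value>1024</value></property>
  <property><name>mapreduce.cluster.reducememory.mb</name><value>2048</value></property>
  <!-- per-job upper bounds = per-task default * slots per slave: 1024*2 and 2048*2 -->
  <property><name>mapreduce.jobtracker.maxmapmemory.mb</name><value>2048</value></property>
  <property><name>mapreduce.jobtracker.maxreducememory.mb</name><value>4096</value></property>
  <!-- physical memory set aside for daemons and the operating system -->
  <property><name>mapreduce.tasktracker.reserved.physicalmemory.mb</name><value>2048</value></property>
  <!-- check task memory usage every 5000 milliseconds -->
  <property><name>mapreduce.tasktracker.taskmemorymanager.monitoringinterval</name><value>5000</value></property>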
-          
-          <p>
-          <em>Note:</em> The virtual memory monitoring function is only 
-          enabled if
-          the variables <code>mapreduce.cluster.{map|reduce}memory.mb</code>
-          and <code>mapreduce.jobtracker.max{map|reduce}memory.mb</code>
-          are set to values greater than zero. Likewise, the physical
-          memory monitoring function is only enabled if the variable
-          <code>mapreduce.tasktracker.reserved.physicalmemory.mb</code>
-          is set to a value greater than zero.
-          </p>
-        </section>
-      </section>
-      
-        
-          <section>
-            <title>Task Controllers</title>
-            <p>Task controllers are classes in the Hadoop Map/Reduce 
-            framework that define how users' map and reduce tasks 
-            are launched and controlled. They can 
-            be used in clusters that require some customization in 
-            the process of launching or controlling the user tasks.
-            For example, in some 
-            clusters, there may be a requirement to run tasks as 
-            the user who submitted the job, instead of as the task 
-            tracker user, which is how tasks are launched by default.
-            This section describes how to configure and use 
-            task controllers.</p>
-            <p>The following task controllers are available in
-            Hadoop.
-            </p>
-            <table>
-            <tr><th>Name</th><th>Class Name</th><th>Description</th></tr>
-            <tr>
-            <td>DefaultTaskController</td>
-            <td>org.apache.hadoop.mapred.DefaultTaskController</td>
-            <td> The default task controller which Hadoop uses to manage task
-            execution. The tasks run as the task tracker user.</td>
-            </tr>
-            <tr>
-            <td>LinuxTaskController</td>
-            <td>org.apache.hadoop.mapred.LinuxTaskController</td>
-            <td>This task controller, which is supported only on Linux, 
-            runs the tasks as the user who submitted the job. It requires
-            these user accounts to be created on the cluster nodes 
-            where the tasks are launched. It 
-            uses a setuid executable that is included in the Hadoop
-            distribution. The task tracker uses this executable to 
-            launch and kill tasks. The setuid executable switches to
-            the user who has submitted the job and launches or kills
-            the tasks. For maximum security, this task controller 
-            sets up restricted permissions and user/group ownership of
-            local files and directories used by the tasks such as the
-            job jar files, intermediate files, task log files and distributed
-            cache files. In particular, note that because of this, no user
-            other than the job owner and the tasktracker can access any of the
-            local files/directories, including those localized as part of the
-            distributed cache.
-            </td>
-            </tr>
-            </table>
-            <section>
-            <title>Configuring Task Controllers</title>
-            <p>The task controller to be used can be configured by setting the
-            value of the following key in mapred-site.xml:</p>
-            <table>
-            <tr>
-            <th>Property</th><th>Value</th><th>Notes</th>
-            </tr>
-            <tr>
-            <td>mapreduce.tasktracker.taskcontroller</td>
-            <td>Fully qualified class name of the task controller class</td>
-            <td>Currently there are two implementations of task controller
-            in the Hadoop system, DefaultTaskController and LinuxTaskController.
-            Refer to the class names mentioned above to determine the value
-            to set for the class of choice.
-            </td>
-            </tr>
-            </table>
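-            <p>For illustration, choosing the LinuxTaskController would mean
-            adding a property like the following to mapred-site.xml (a minimal
-            sketch; the value is the class name from the table above):</p>
-            <source>
-&lt;property&gt;
-  &lt;name&gt;mapreduce.tasktracker.taskcontroller&lt;/name&gt;
-  &lt;value&gt;org.apache.hadoop.mapred.LinuxTaskController&lt;/value&gt;
-&lt;/property&gt;
-            </source>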
-            </section>
-            <section>
-            <title>Using the LinuxTaskController</title>
-            <p>This section of the document describes the steps required to
-            use the LinuxTaskController.</p>
-            
-            <p>In order to use the LinuxTaskController, a setuid executable
-            should be built and deployed on the compute nodes. The
-            executable is named task-controller. To build the executable, 
-            execute 
-            <em>ant task-controller -Dhadoop.conf.dir=/path/to/conf/dir.
-            </em>
-            The path passed in <em>-Dhadoop.conf.dir</em> should be the path
-            on the cluster nodes where a configuration file for the setuid
-            executable would be located. The executable would be built to
-            <em>build.dir/dist.dir/bin</em> and should be installed to 
-            <em>$HADOOP_PREFIX/bin</em>.
-            </p>
-            
-            <p>
-            The executable must have specific permissions as follows. The
-            executable should have <em>6050 or --Sr-s---</em> permissions
-            user-owned by root (super-user) and group-owned by a special group
-            of which the TaskTracker's user is a member and no job
-            submitter is. If any job submitter belongs to this special group,
-            security will be compromised. This special group name should be
-            specified for the configuration property
-            <em>"mapreduce.tasktracker.group"</em> in both mapred-site.xml and
-            <a href="#taskcontroller.cfg">taskcontroller.cfg</a>.
-            For example, let's say that the TaskTracker is run as user
-            <em>mapred</em> who is part of the groups <em>users</em> and
-            <em>specialGroup</em>, either of which may be its primary group.
-            Let it also be the case that <em>users</em> has both <em>mapred</em> and
-            another user (job submitter) <em>X</em> as its members, and X does
-            not belong to <em>specialGroup</em>. Going by the above
-            description, the setuid/setgid executable should be set
-            <em>6050 or --Sr-s---</em> with user-owner as <em>root</em> and
-            group-owner as <em>specialGroup</em>, which has
-            <em>mapred</em> as its member (and not <em>users</em>, which has
-            <em>X</em> as a member besides <em>mapred</em>).
-            </p>
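-            <p>As a minimal sketch of the above (the install path is an
-            assumption; substitute the actual location of the binary):</p>
-            <source>
-# user-owner root, group-owner specialGroup, permissions 6050 (--Sr-s---)
-chown root:specialGroup $HADOOP_PREFIX/bin/task-controller
-chmod 6050 $HADOOP_PREFIX/bin/task-controller
-            </source>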
-
-            <p>
-            The LinuxTaskController requires that the directories specified in
-            <em>mapreduce.cluster.local.dir</em> and <em>hadoop.log.dir</em>,
-            and all paths leading up to them, have 755 permissions.
-            </p>
-            
-            <section>
-            <title>taskcontroller.cfg</title>
-            <p>The executable requires a configuration file called 
-            <em>taskcontroller.cfg</em> to be
-            present in the configuration directory passed to the ant target 
-            mentioned above. If the binary was not built with a specific 
-            conf directory, the path defaults to
-            <em>/path-to-binary/../conf</em>. The configuration file must be
-            owned by the user running the TaskTracker (user <em>mapred</em> in the
-            above example), may be group-owned by any group, and should have the
-            permissions <em>0400 or r--------</em>.
-            </p>
-            
-            <p>The executable requires the following configuration items to be
-            present in the <em>taskcontroller.cfg</em> file. The items should
-            be specified as simple <em>key=value</em> pairs.
-            </p>
-            <table><tr><th>Name</th><th>Description</th></tr>
-            <tr>
-            <td>mapreduce.cluster.local.dir</td>
-            <td>Path to the local directories used by MapReduce. Should be the same as
-            the value provided for this key in mapred-site.xml. This is required to
-            validate paths passed to the setuid executable in order to prevent
-            arbitrary paths being passed to it.</td>
-            </tr>
-            <tr>
-            <td>hadoop.log.dir</td>
-            <td>Path to the hadoop log directory. Should be the same as the value that
-            the TaskTracker is started with. This is required to set proper
-            permissions on the log files so that they can be written to by the user's
-            tasks and read by the TaskTracker for serving on the web UI.</td>
-            </tr>
-            <tr>
-            <td>mapreduce.tasktracker.group</td>
-            <td>Group to which the TaskTracker belongs. The group owner of the
-            taskcontroller binary should be this group. Should be the same as
-            the value with which the TaskTracker is configured. This 
-            configuration is required for validating the secure access of the
-            task-controller binary.</td>
-            </tr>
-            </table>
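-            <p>An illustrative <em>taskcontroller.cfg</em> might therefore look
-            as follows (the paths are assumptions; the group matches the
-            example above):</p>
-            <source>
-mapreduce.cluster.local.dir=/grid/hadoop/mapred/local
-hadoop.log.dir=/var/log/hadoop
-mapreduce.tasktracker.group=specialGroup
-            </source>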
-            </section>
-            </section>
-            
-          </section>
-          <section>
-            <title>Monitoring Health of TaskTracker Nodes</title>
-            <p>Hadoop Map/Reduce provides a mechanism by which administrators 
-            can configure the TaskTracker to run an administrator supplied
-            script periodically to determine if a node is healthy or not.
-            Administrators can determine if the node is in a healthy state
-            by performing any checks of their choice in the script. If the
-            script detects the node to be in an unhealthy state, it must print
-            a line to standard output beginning with the string <em>ERROR</em>.
-            The TaskTracker spawns the script periodically and checks its 
-            output. If the script's output contains the string <em>ERROR</em>,
-            as described above, the node's status is reported as 'unhealthy'
-            and the node is black-listed on the JobTracker. No further tasks 
-            will be assigned to this node. However, the
-            TaskTracker continues to run the script, so that if the node
-            becomes healthy again, it will be removed from the blacklisted
-            nodes on the JobTracker automatically. The node's health
-            along with the output of the script, if it is unhealthy, is
-            available to the administrator in the JobTracker's web interface.
-            The time since the node was healthy is also displayed on the 
-            web interface.
-            </p>
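-            <p>For illustration only, a minimal health check script might look
-            like the following; the disk-usage check is an assumption, and any
-            checks appropriate to the environment can be substituted, as long
-            as an unhealthy node prints a line beginning with ERROR:</p>
-            <source>
-#!/bin/bash
-# Hypothetical check: report the node unhealthy if the root filesystem
-# is more than 90% full.
-used=$(df -P / | awk 'NR==2 {sub("%","",$5); print $5}')
-if [ "$used" -gt 90 ]; then
-  echo "ERROR: root filesystem is ${used}% full"
-fi
-            </source>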
-            
-            <section>
-            <title>Configuring the Node Health Check Script</title>
-            <p>The following parameters can be used to control the node health 
-            monitoring script in <em>mapred-site.xml</em>.</p>
-            <table>
-            <tr><th>Name</th><th>Description</th></tr>
-            <tr><td><code>mapreduce.tasktracker.healthchecker.script.path</code></td>
-            <td>Absolute path to the script which is periodically run by the 
-            TaskTracker to determine if the node is 
-            healthy or not. The file should be executable by the TaskTracker.
-            If the value of this key is empty or the file does 
-            not exist or is not executable, node health monitoring
-            is not started.</td>
-            </tr>
-            <tr>
-            <td><code>mapreduce.tasktracker.healthchecker.interval</code></td>
-            <td>Frequency at which the node health script is run, 
-            in milliseconds</td>
-            </tr>
-            <tr>
-            <td><code>mapreduce.tasktracker.healthchecker.script.timeout</code></td>
-            <td>Time after which the node health script will be killed by
-            the TaskTracker if unresponsive. The node is marked unhealthy
-            if the node health script times out.</td>
-            </tr>
-            <tr>
-            <td><code>mapreduce.tasktracker.healthchecker.script.args</code></td>
-            <td>Extra arguments that can be passed to the node health script 
-            when launched. These should be specified as a comma-separated
-            list of arguments.</td>
-            </tr>
-            </table>
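-            <p>As an illustrative <em>mapred-site.xml</em> snippet (the script
-            path and the 60-second interval are assumptions):</p>
-            <source>
-&lt;property&gt;
-  &lt;name&gt;mapreduce.tasktracker.healthchecker.script.path&lt;/name&gt;
-  &lt;value&gt;/etc/hadoop/health-check.sh&lt;/value&gt;
-&lt;/property&gt;
-&lt;property&gt;
-  &lt;name&gt;mapreduce.tasktracker.healthchecker.interval&lt;/name&gt;
-  &lt;value&gt;60000&lt;/value&gt;
-&lt;/property&gt;
-            </source>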
-            </section>
-          </section>
-          
-        </section>
-        
-        <section>
-          <title>Slaves</title>
-          
-          <p>Typically you choose one machine in the cluster to act as the 
-          <code>NameNode</code> and one machine to act as the
-          <code>JobTracker</code>, exclusively. The rest of the machines act as 
-          both a <code>DataNode</code> and <code>TaskTracker</code> and are 
-          referred to as <em>slaves</em>.</p>
-          
-          <p>List all slave hostnames or IP addresses in your 
-          <code>conf/slaves</code> file, one per line.</p>
-        </section>
-        
-        <section>
-          <title>Logging</title>
-          
-          <p>Hadoop uses the <a href="http://logging.apache.org/log4j/">Apache 
-          log4j</a> via the <a href="http://commons.apache.org/logging/">Apache 
-          Commons Logging</a> framework for logging. Edit the 
-          <code>conf/log4j.properties</code> file to customize the Hadoop 
-          daemons' logging configuration (log-formats and so on).</p>
-          
-          <section>
-            <title>History Logging</title>
-            
-            <p> The job history files are stored in the central location
-            <code> mapreduce.jobtracker.jobhistory.location </code>, which can also be on DFS and
-            whose default value is <code>${HADOOP_LOG_DIR}/history</code>.
-            The history web UI is accessible from the JobTracker web UI.</p>
-            
-            <p> The history files are also logged to the user-specified directory
-            <code>mapreduce.job.userhistorylocation</code>,
-            which defaults to the job output directory. The files are stored in
-            "_logs/history/" under the specified directory. Hence, by default
-            they will be in "mapreduce.output.fileoutputformat.outputdir/_logs/history/". Users can disable
-            this logging by giving the value <code>none</code> for
-            <code>mapreduce.job.userhistorylocation</code>.</p>
-            
-            <p> Users can view a summary of the history logs in the specified directory
-            using the following command <br/>
-            <code>$ bin/hadoop job -history output-dir</code><br/> 
-            This command will print job details, failed and killed tip
-            details. <br/>
-            More details about the job such as successful tasks and 
-            task attempts made for each task can be viewed using the  
-            following command <br/>
-            <code>$ bin/hadoop job -history all output-dir</code><br/></p> 
-          </section>
-        </section>
-      
-      <p>Once all the necessary configuration is complete, distribute the files
-      to the <code>HADOOP_CONF_DIR</code> directory on all the machines, 
-      typically <code>${HADOOP_PREFIX}/conf</code>.</p>
-    </section>
-    <section>
-      <title>Cluster Restartability</title>
-      <section>
-        <title>Map/Reduce</title>
-        <p>A JobTracker restart can recover running jobs if
-        <code>mapreduce.jobtracker.restart.recover</code> is set to true and
-        <a href="#Logging">JobHistory logging</a> is enabled. In addition, the
-        <code>mapreduce.jobtracker.jobhistory.block.size</code> value should be
-        set to an optimal value so that job history is dumped to disk as soon as
-        possible; a typical value is 3145728 (3 MB).</p>
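-        <p>A minimal <em>mapred-site.xml</em> sketch of these two settings
-        (values as discussed above):</p>
-        <source>
-&lt;property&gt;
-  &lt;name&gt;mapreduce.jobtracker.restart.recover&lt;/name&gt;
-  &lt;value&gt;true&lt;/value&gt;
-&lt;/property&gt;
-&lt;property&gt;
-  &lt;name&gt;mapreduce.jobtracker.jobhistory.block.size&lt;/name&gt;
-  &lt;value&gt;3145728&lt;/value&gt;
-&lt;/property&gt;
-        </source>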
-      </section>
-    </section>
-    
-    <section>
-      <title>Hadoop Rack Awareness</title>
-      <p>
-         Both HDFS and Map/Reduce components are rack-aware.  HDFS block placement will use rack 
-         awareness for fault tolerance by placing one block replica on a different rack.  This provides 
-         data availability in the event of a network switch failure within the cluster.  The jobtracker uses rack
-         awareness to reduce network transfers of HDFS data blocks by attempting to schedule tasks on datanodes with a local
-         copy of needed HDFS blocks.  If the tasks cannot be scheduled on the datanodes
-         containing the needed HDFS blocks, then the tasks will be scheduled on the same rack to reduce network transfers if possible.
-      </p>
-      <p>The NameNode and the JobTracker obtain the rack id of the cluster slaves by invoking either 
-         an external script or a java class, as specified by the configuration files.  Whether the
-         java class or an external script is used for topology, the output must adhere to the java
-         <a href="ext:api/org/apache/hadoop/net/dnstoswitchmapping/resolve">DNSToSwitchMapping</a> 
-         interface.  The interface expects a one-to-one correspondence to be maintained 
-         and the topology information in the format of '/myrack/myhost', where '/' is the topology 
-         delimiter, 'myrack' is the rack identifier, and 'myhost' is the individual host.  Assuming 
-         a single /24 subnet per rack, one could use the format of '/192.168.100.0/192.168.100.5' as a 
-         unique rack-host topology mapping.
-      </p>
-      <p>
-         To use the java class for topology mapping, the class name is specified by the 
-         <code>'topology.node.switch.mapping.impl'</code> parameter in the configuration file. 
-         An example, NetworkTopology.java, is included with the hadoop distribution and can be customized 
-         by the hadoop administrator.  If not included with your distribution, NetworkTopology.java can also be found in the Hadoop 
-         <a href="http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java?view=markup">
-         subversion tree</a>.  Using a java class instead of an external script has a slight performance benefit in 
-         that it doesn't need to fork an external process when a new slave node registers itself with the jobtracker or namenode.  
-         As this class is only used during slave node registration, the performance benefit is limited.  
-      </p>
-      <p>
-         If implementing an external script, it will be specified with the
-         <code>topology.script.file.name</code> parameter in the configuration files.  Unlike the java 
-         class, the external topology script is not included with the Hadoop distribution and is provided by the 
-         administrator.  Hadoop will send multiple IP addresses to ARGV when forking the topology script.  The  
-         number of IP addresses sent to the topology script is controlled with <code>net.topology.script.number.args</code>
-         and defaults to 100. If <code>net.topology.script.number.args</code> was changed to 1, a topology script would 
-         get forked for each IP submitted by datanodes and/or tasktrackers.  Below are example topology scripts.
-      </p>
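-      <p>
-         The external topology script is enabled with configuration such as the following
-         illustrative snippet (the script path is an assumption; the argument count is the
-         default mentioned above):
-      </p>
-      <source>
-&lt;property&gt;
-  &lt;name&gt;topology.script.file.name&lt;/name&gt;
-  &lt;value&gt;/etc/hadoop/topology.py&lt;/value&gt;
-&lt;/property&gt;
-&lt;property&gt;
-  &lt;name&gt;net.topology.script.number.args&lt;/name&gt;
-  &lt;value&gt;100&lt;/value&gt;
-&lt;/property&gt;
-      </source>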
-      <section>
-      <title>Python example</title>
-      <source>
-      <code>
-      #!/usr/bin/python
-
-      # this script makes assumptions about the physical environment.
-      #  1) each rack is its own layer 3 network with a /24 subnet, which could be typical where each rack has its own
-      #     switch with uplinks to a central core router.
-      #     
-      #             +-----------+
-      #             |core router|
-      #             +-----------+
-      #            /             \
-      #   +-----------+        +-----------+
-      #   |rack switch|        |rack switch|
-      #   +-----------+        +-----------+
-      #   | data node |        | data node |
-      #   +-----------+        +-----------+
-      #   | data node |        | data node |
-      #   +-----------+        +-----------+
-      #
-      # 2) topology script gets list of IP's as input, calculates network address, and prints '/network_address/ip'.
-
-      import netaddr
-      import sys             
-      sys.argv.pop(0)                                                  # discard name of topology script from argv list as we just want IP addresses
-
-      netmask = '255.255.255.0'                                        # set netmask to what's being used in your environment.  The example uses a /24
-
-      for ip in sys.argv:                                              # loop over list of datanode IP's
-          address = '{0}/{1}'.format(ip, netmask)                      # format address string so it looks like 'ip/netmask' to make netaddr work
-          try:
-              network_address = netaddr.IPNetwork(address).network     # calculate and print network address
-              print "/{0}".format(network_address)                     
-          except:
-              print "/rack-unknown"                                    # print catch-all value if unable to calculate network address
-
-      </code>
-      </source>
-      </section>
-          
-      <section>
-      <title>Bash  example</title>
-      <source>
-      <code>
-      #!/bin/bash
-      # Here's a bash example to show just how simple these scripts can be
-      
-      # Assuming we have a flat network with everything on a single switch, we can fake a rack topology.
-      # This could occur in a lab environment where we have limited nodes, like 2-8 physical machines on an unmanaged switch.
-      # This may also apply to multiple virtual machines running on the same physical hardware.
-      # The number of machines isn't important; what matters is that we are faking a network topology when there isn't one.
-      #
-      #       +----------+    +--------+
-      #       |jobtracker|    |datanode| 
-      #       +----------+    +--------+
-      #              \        /
-      #  +--------+  +--------+  +--------+
-      #  |datanode|--| switch |--|datanode|
-      #  +--------+  +--------+  +--------+
-      #              /        \
-      #       +--------+    +--------+
-      #       |datanode|    |namenode| 
-      #       +--------+    +--------+
-      #
-      # With this network topology, we are treating each host as a rack.  This is being done by taking the last octet 
-      # in the datanode's IP and prepending it with the word '/rack-'.  The advantage for doing this is so HDFS
-      # can create its 'off-rack' block copy.
-      
-      # 1) 'echo $@' will echo all ARGV values to xargs.  
-      # 2) 'xargs' will enforce that we print a single argv value per line
-      # 3) 'awk' will split fields on dots and append the last field to the string '/rack-'. If awk
-      #    fails to split on four dots, it will still print '/rack-' followed by the last field value
-
-      echo $@ | xargs -n 1 | awk -F '.' '{print "/rack-"$NF}'
-
-
-      </code>
-      </source>
-      </section>
-
-
-      <p>
-         If <code>topology.script.file.name</code> or <code>topology.node.switch.mapping.impl</code> is 
-         not set, the rack id '/default-rack' is returned for any passed IP address.  
-         While this behavior appears desirable, it can cause issues with HDFS block replication: the
-         default behavior is to write one replica off-rack, which is not possible when there is
-         only a single rack named '/default-rack'.
-      </p>
-      <p>
-         An additional configuration setting is <code>mapred.cache.task.levels</code> which determines 
-         the number of levels (in the network topology) of caches. So, for example, if it is the 
-         default value of 2, two levels of caches will be constructed - one for hosts 
-         (host -> task mapping) and another for racks (rack -> task mapping), giving us our one-to-one
-         mapping of '/myrack/myhost'.
-      </p>
-    </section>
-    
-    <section>
-      <title>Hadoop Startup</title>
-      
-      <p>To start a Hadoop cluster you will need to start both the HDFS and 
-      Map/Reduce cluster.</p>
-
-      <p>
-        Format a new distributed filesystem:<br/>
-        <code>$ bin/hadoop namenode -format</code>
-      </p>
-      
-      <p>
-        Start the HDFS with the following command, run on the designated
-        <code>NameNode</code>:<br/>
-        <code>$ bin/start-dfs.sh</code>
-      </p>
-      <p>The <code>bin/start-dfs.sh</code> script also consults the 
-      <code>${HADOOP_CONF_DIR}/slaves</code> file on the <code>NameNode</code> 
-      and starts the <code>DataNode</code> daemon on all the listed slaves.</p>
-      
-      <p>
-        Start Map-Reduce with the following command, run on the designated
-        <code>JobTracker</code>:<br/>
-        <code>$ bin/start-mapred.sh</code>
-      </p>
-      <p>The <code>bin/start-mapred.sh</code> script also consults the 
-      <code>${HADOOP_CONF_DIR}/slaves</code> file on the <code>JobTracker</code> 
-      and starts the <code>TaskTracker</code> daemon on all the listed slaves.
-      </p>
-    </section>
-
-    <section>
-      <title>Hadoop Shutdown</title>
-      
-      <p>
-        Stop HDFS with the following command, run on the designated 
-        <code>NameNode</code>:<br/>
-        <code>$ bin/stop-dfs.sh</code>
-      </p>
-      <p>The <code>bin/stop-dfs.sh</code> script also consults the 
-      <code>${HADOOP_CONF_DIR}/slaves</code> file on the <code>NameNode</code> 
-      and stops the <code>DataNode</code> daemon on all the listed slaves.</p>
-      
-      <p>
-        Stop Map/Reduce with the following command, run on the designated
-        <code>JobTracker</code>:<br/>
-        <code>$ bin/stop-mapred.sh</code><br/>
-      </p>
-      <p>The <code>bin/stop-mapred.sh</code> script also consults the 
-      <code>${HADOOP_CONF_DIR}/slaves</code> file on the <code>JobTracker</code> 
-      and stops the <code>TaskTracker</code> daemon on all the listed slaves.</p>
-    </section>
-  </body>
-  
-</document>

+ 0 - 798
hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/commands_manual.xml

@@ -1,798 +0,0 @@
-<?xml version="1.0"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
-<document>
-	<header>
-		<title>Hadoop Commands Guide</title>
-	</header>
-	
-	<body>
-		<section>
-			<title>Overview</title>
-			<p>
-				All Hadoop commands are invoked by the bin/hadoop script. Running the Hadoop
-				script without any arguments prints the description for all commands.
-			</p>
-			<p>
-				<code>Usage: hadoop [--config confdir] [COMMAND] [GENERIC_OPTIONS] [COMMAND_OPTIONS]</code>
-			</p>
-			<p>
-				Hadoop has an option parsing framework that employs parsing generic options as well as running classes.
-			</p>
-			<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			
-			           <tr>
-			          	<td><code>--config confdir</code></td>
-			            <td>Overwrites the default Configuration directory. Default is ${HADOOP_PREFIX}/conf.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>GENERIC_OPTIONS</code></td>
-			            <td>The common set of options supported by multiple commands.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>COMMAND</code><br/><code>COMMAND_OPTIONS</code></td>
-			            <td>Various commands with their options are described in the following sections. The commands 
-			            have been grouped into <a href="commands_manual.html#User+Commands">User Commands</a> 
-			            and <a href="commands_manual.html#Administration+Commands">Administration Commands</a>.</td>
-			           </tr>
-			     </table>
-			 <section>
-				<title>Generic Options</title>
-				<p>
-				  The following options are supported by <a href="commands_manual.html#dfsadmin">dfsadmin</a>, 
-                  <a href="commands_manual.html#fs">fs</a>, <a href="commands_manual.html#fsck">fsck</a>, 
-                  <a href="commands_manual.html#job">job</a> and <a href="commands_manual.html#fetchdt">fetchdt</a>.
-				  Applications should implement 
-				  <a href="ext:api/org/apache/hadoop/util/tool">Tool</a> to support
-				  <a href="ext:api/org/apache/hadoop/util/genericoptionsparser">
-				  GenericOptions</a>.
-				</p>
-			     <table>
-			          <tr><th> GENERIC_OPTION </th><th> Description </th></tr>
-			
-			           <tr>
-			          	<td><code>-conf &lt;configuration file&gt;</code></td>
-			            <td>Specify an application configuration file.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-D &lt;property=value&gt;</code></td>
-			            <td>Use value for given property.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-fs &lt;local|namenode:port&gt;</code></td>
-			            <td>Specify a namenode.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-jt &lt;local|jobtracker:port&gt;</code></td>
-			            <td>Specify a job tracker. Applies only to <a href="commands_manual.html#job">job</a>.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-files &lt;comma separated list of files&gt;</code></td>
-			            <td>Specify comma separated files to be copied to the map reduce cluster. 
-			            Applies only to <a href="commands_manual.html#job">job</a>.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-libjars &lt;comma separated list of jars&gt;</code></td>
-			            <td>Specify comma separated jar files to include in the classpath. 
-			            Applies only to <a href="commands_manual.html#job">job</a>.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-archives &lt;comma separated list of archives&gt;</code></td>
-			            <td>Specify comma separated archives to be unarchived on the compute machines. 
-			            Applies only to <a href="commands_manual.html#job">job</a>.</td>
-			           </tr>
-				</table>
-			</section>	   
-		</section>
-		
-		<section>
-			<title> User Commands </title>
-			<p>Commands useful for users of a Hadoop cluster.</p>
-			<section>
-				<title> archive </title>
-				<p>
-					Creates a Hadoop archive. For more information, see the <a href="ext:hadoop-archives">Hadoop Archives Guide</a>.
-				</p>
-				<p>
-					<code>Usage: hadoop archive -archiveName NAME &lt;src&gt;* &lt;dest&gt;</code>
-				</p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-					   <tr>
-			          	<td><code>-archiveName NAME</code></td>
-			            <td>Name of the archive to be created.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>src</code></td>
-			            <td>Filesystem pathnames which work as usual with regular expressions.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>dest</code></td>
-			            <td>Destination directory which would contain the archive.</td>
-			           </tr>
-			     </table>
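-				<p>
-					Example (the archive name and paths are illustrative):
-				</p>
-				<p>
-					<code>hadoop archive -archiveName data.har /user/hadoop/dir1 /user/hadoop/dir2 /user/hadoop/archives</code>
-				</p>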
-			</section>
-			
-			<section>
-				<title> distcp </title>
-				<p>
-					Copies files or directories recursively. More information can be found at the <a href="ext:distcp">DistCp Guide</a>.
-				</p>
-				<p>
-					<code>Usage: hadoop distcp &lt;srcurl&gt; &lt;desturl&gt;</code>
-				</p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			
-			           <tr>
-			          	<td><code>srcurl</code></td>
-			            <td>Source Url</td>
-			           </tr>
-			           <tr>
-			          	<td><code>desturl</code></td>
-			            <td>Destination Url</td>
-			           </tr>
-			     </table>
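-				<p>
-					Example (the URLs are illustrative):
-				</p>
-				<p>
-					<code>hadoop distcp hdfs://nn1.example.com/foo hdfs://nn2.example.com/bar</code>
-				</p>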
-			</section>
-			       
-			<section>
-				<title> fs </title>
-				<p>
-					Runs a generic filesystem user client.
-				</p>
-				<p>
-					<code>Usage: hadoop fs [</code><a href="commands_manual.html#Generic+Options">GENERIC_OPTIONS</a><code>] 
-					[COMMAND_OPTIONS]</code>
-				</p>
-				<p>
-					The various COMMAND_OPTIONS can be found at 
-					<a href="file_system_shell.html">File System Shell Guide</a>.
-				</p>   
-			</section>
-			
-			<section>
-				<title> fsck </title>
-				<p>
-					Runs an HDFS filesystem checking utility. See <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Fsck">Fsck</a> for more info.
-				</p> 
-				<p><code>Usage: hadoop fsck [</code><a href="commands_manual.html#Generic+Options">GENERIC_OPTIONS</a><code>] 
-				&lt;path&gt; [-move | -delete | -openforwrite] [-files [-blocks 
-				[-locations | -racks]]]</code></p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			          <tr>
-			            <td><code>&lt;path&gt;</code></td>
-			            <td>Start checking from this path.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-move</code></td>
-			            <td>Move corrupted files to /lost+found</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-delete</code></td>
-			            <td>Delete corrupted files.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-openforwrite</code></td>
-			            <td>Print out files opened for write.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-files</code></td>
-			            <td>Print out files being checked.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-blocks</code></td>
-			            <td>Print out block report.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-locations</code></td>
-			            <td>Print out locations for every block.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-racks</code></td>
-			            <td>Print out network topology for data-node locations.</td>
-			           </tr>
-					</table>
-			</section>
-			
-			<section>
-              <title> fetchdt </title>
-              <p>
-                Gets Delegation Token from a NameNode. See <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#fetchdt">fetchdt</a> for more info.
-              </p> 
-              <p><code>Usage: hadoop fetchdt [</code><a href="commands_manual.html#Generic+Options">GENERIC_OPTIONS</a><code>] 
-                 [--webservice &lt;namenode_http_addr&gt;] &lt;file_name&gt; </code></p>
-                 <table>
-                   <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-                   <tr>
-                     <td><code>&lt;file_name&gt;</code></td>
-                     <td>File name to store the token into.</td>
-                   </tr>
-                   <tr>
-                     <td><code>--webservice &lt;https_address&gt;</code></td>
-                     <td>use http protocol instead of RPC</td>
-                   </tr>
-                 </table>
-             </section>
-                        
-             <section>
-				<title> jar </title>
-				<p>
-					Runs a jar file. Users can bundle their Map Reduce code in a jar file and execute it using this command.
-				</p> 
-				<p>
-					<code>Usage: hadoop jar &lt;jar&gt; [mainClass] args...</code>
-				</p>
-				<p>
-					Streaming jobs are run via this command. For examples, see 
-					<a href="ext:streaming">Hadoop Streaming</a>.
-				</p>
-				<p>
-					The WordCount example is also run using the jar command. For examples, see the
-					<a href="ext:mapred-tutorial">MapReduce Tutorial</a>.
-				</p>
-			</section>
-			
-			<section>
-				<title> job </title>
-				<p>
-					Command to interact with Map Reduce Jobs.
-				</p>
-				<p>
-					<code>Usage: hadoop job [</code><a href="commands_manual.html#Generic+Options">GENERIC_OPTIONS</a><code>] 
-					[-submit &lt;job-file&gt;] | [-status &lt;job-id&gt;] | 
-					[-counter &lt;job-id&gt; &lt;group-name&gt; &lt;counter-name&gt;] | [-kill &lt;job-id&gt;] | 
-					[-events &lt;job-id&gt; &lt;from-event-#&gt; &lt;#-of-events&gt;] | [-history [all] &lt;historyFile&gt;] |
-					[-list [all]] | [-kill-task &lt;task-id&gt;] | [-fail-task &lt;task-id&gt;] | 
-          [-set-priority &lt;job-id&gt; &lt;priority&gt;]</code>
-				</p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			
-			           <tr>
-			          	<td><code>-submit &lt;job-file&gt;</code></td>
-			            <td>Submits the job.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-status &lt;job-id&gt;</code></td>
-			            <td>Prints the map and reduce completion percentage and all job counters.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-counter &lt;job-id&gt; &lt;group-name&gt; &lt;counter-name&gt;</code></td>
-			            <td>Prints the counter value.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-kill &lt;job-id&gt;</code></td>
-			            <td>Kills the job.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-events &lt;job-id&gt; &lt;from-event-#&gt; &lt;#-of-events&gt;</code></td>
-			            <td>Prints the events' details received by jobtracker for the given range.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-history [all] &lt;historyFile&gt;</code></td>
-			            <td>-history &lt;historyFile&gt; prints job details, failed and killed tip details. More details 
-			            about the job such as successful tasks and task attempts made for each task can be viewed by 
-			            specifying the [all] option. </td>
-			           </tr>
-			           <tr>
-			          	<td><code>-list [all]</code></td>
-			            <td>-list all displays all jobs. -list displays only jobs which are yet to complete.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-kill-task &lt;task-id&gt;</code></td>
-			            <td>Kills the task. Killed tasks are NOT counted against failed attempts.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-fail-task &lt;task-id&gt;</code></td>
-			            <td>Fails the task. Failed tasks are counted against failed attempts.</td>
-			           </tr>
-                 <tr>
-                  <td><code>-set-priority &lt;job-id&gt; &lt;priority&gt;</code></td>
-                  <td>Changes the priority of the job. 
-                  Allowed priority values are VERY_HIGH, HIGH, NORMAL, LOW, VERY_LOW</td>
-                 </tr>
-					</table>
-			</section>
-			
-			<section>
-				<title> pipes </title>
-				<p>
-					Runs a pipes job.
-				</p>
-				<p>
-					<code>Usage: hadoop pipes [-conf &lt;path&gt;] [-jobconf &lt;key=value&gt;, &lt;key=value&gt;, ...] 
-					[-input &lt;path&gt;] [-output &lt;path&gt;] [-jar &lt;jar file&gt;] [-inputformat &lt;class&gt;] 
-					[-map &lt;class&gt;] [-partitioner &lt;class&gt;] [-reduce &lt;class&gt;] [-writer &lt;class&gt;] 
-					[-program &lt;executable&gt;] [-reduces &lt;num&gt;] </code>
-				</p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			
-			          <tr>
-			          	<td><code>-conf &lt;path&gt;</code></td>
-			            <td>Configuration for job</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-jobconf &lt;key=value&gt;, &lt;key=value&gt;, ...</code></td>
-			            <td>Add/override configuration for job</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-input &lt;path&gt;</code></td>
-			            <td>Input directory</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-output &lt;path&gt;</code></td>
-			            <td>Output directory</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-jar &lt;jar file&gt;</code></td>
-			            <td>Jar filename</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-inputformat &lt;class&gt;</code></td>
-			            <td>InputFormat class</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-map &lt;class&gt;</code></td>
-			            <td>Java Map class</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-partitioner &lt;class&gt;</code></td>
-			            <td>Java Partitioner</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-reduce &lt;class&gt;</code></td>
-			            <td>Java Reduce class</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-writer &lt;class&gt;</code></td>
-			            <td>Java RecordWriter</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-program &lt;executable&gt;</code></td>
-			            <td>Executable URI</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-reduces &lt;num&gt;</code></td>
-			            <td>Number of reduces</td>
-			           </tr>
-					</table>
-			</section>
-      <section>
-        <title> queue </title>
-        <p>
-          Command to interact with and view Job Queue information.
-        </p>
-        <p>
-          <code>Usage: hadoop queue [-list] | [-info &lt;job-queue-name&gt; [-showJobs]] | [-showacls]</code>
-        </p>
-        <table>
-        <tr>
-          <th> COMMAND_OPTION </th><th> Description </th>
-        </tr>
-        <tr>
-          <td><anchor id="QueuesList"/><code>-list</code> </td>
-          <td>Gets the list of Job Queues configured in the system, along with the scheduling information
-          associated with them.
-          </td>
-        </tr>
-        <tr>
-          <td><anchor id="QueuesInfo"/><code>-info &lt;job-queue-name&gt; [-showJobs]</code></td>
-          <td>
-           Displays the queue information and associated scheduling information of the particular
-           job queue. If the -showJobs option is present, a list of jobs submitted to that particular
-           job queue is displayed. 
-          </td>
-        </tr>
-        <tr>
-          <td><code>-showacls</code></td>
-          <td>Displays the queue name and associated queue operations allowed for the current user.
-          The list consists of only those queues to which the user has access.
-          </td>
-          </tr>
-        </table>
-      </section>  	
-			<section>
-				<title> version </title>
-				<p>
-					Prints the version.
-				</p> 
-				<p>
-					<code>Usage: hadoop version</code>
-				</p>
-			</section>
-			<section>
-				<title> CLASSNAME </title>
-				<p>
-					 The hadoop script can be used to invoke any class.
-				</p>
-				<p>
-					 Runs the class named CLASSNAME.
-				</p>
-
-				<p>
-					<code>Usage: hadoop CLASSNAME</code>
-				</p>
-
-			</section>
-    </section>
-		<section>
-			<title> Administration Commands </title>
-			<p>Commands useful for administrators of a Hadoop cluster.</p>
-			<section>
-				<title> balancer </title>
-				<p>
-					Runs a cluster balancing utility. An administrator can simply press Ctrl-C to stop the 
-					rebalancing process. For more details see 
-					<a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Rebalancer">Rebalancer</a>.
-				</p>
-				<p>
-					<code>Usage: hadoop balancer [-policy &lt;blockpool|datanode&gt;] [-threshold &lt;threshold&gt;]</code>
-				</p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			           <tr>
-					<td><code>-policy &lt;blockpool|datanode&gt;</code></td>
-					<td>The balancing policy.
-					    <br /><code>datanode</code>: Cluster is balanced if the disk usage of each datanode is balanced.
-					    <br /><code>blockpool</code>: Cluster is balanced if the disk usage of each block pool in each datanode is balanced.
-					    <br />Note that <code>blockpool</code> is a stronger condition than <code>datanode</code>.
-					    The default policy is <code>datanode</code>.
-					</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-threshold &lt;threshold&gt;</code></td>
-			            <td>Percentage of disk capacity. The default threshold is 10%.</td>
-			           </tr>
-			     </table>
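-				<p>
-					Example (the threshold value is illustrative): <code>hadoop balancer -threshold 5</code>
-				</p>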
-			</section>
-			
-			<section>
-				<title> daemonlog </title>
-				<p>
-					 Get/Set the log level for each daemon.
-				</p> 
-				<p>
-					<code>Usage: hadoop daemonlog  -getlevel &lt;host:port&gt; &lt;name&gt;</code><br/>
-					<code>Usage: hadoop daemonlog  -setlevel &lt;host:port&gt; &lt;name&gt; &lt;level&gt;</code>
-				</p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			
-			           <tr>
-			          	<td><code>-getlevel &lt;host:port&gt; &lt;name&gt;</code></td>
-			            <td>Prints the log level of the daemon running at &lt;host:port&gt;. 
-			            This command internally connects to http://&lt;host:port&gt;/logLevel?log=&lt;name&gt;</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-setlevel &lt;host:port&gt; &lt;name&gt; &lt;level&gt;</code></td>
-			            <td>Sets the log level of the daemon running at &lt;host:port&gt;. 
-			            This command internally connects to http://&lt;host:port&gt;/logLevel?log=&lt;name&gt;</td>
-			           </tr>
-			     </table>
-			</section>
-			
-			<section>
-				<title> datanode</title>
-				<p>
-					Runs an HDFS datanode.
-				</p> 
-				<p>
-					<code>Usage: hadoop datanode [-rollback]</code>
-				</p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			
-			           <tr>
-			          	<td><code>-rollback</code></td>
-			            <td>Rolls back the datanode to the previous version. This should be used after stopping the datanode 
-			            and distributing the old Hadoop version.</td>
-			           </tr>
-			     </table>
-			</section>
-			
-			<section>
-				<title> dfsadmin </title>
-				<p>
-					Runs an HDFS dfsadmin client.
-				</p> 
-				<p>
-					<code>Usage: hadoop dfsadmin  [</code><a href="commands_manual.html#Generic+Options">GENERIC_OPTIONS</a><code>] [-report] [-safemode enter | leave | get | wait] [-refreshNodes]
-					 [-finalizeUpgrade] [-upgradeProgress status | details | force] [-metasave filename] 
-					 [-setQuota &lt;quota&gt; &lt;dirname&gt;...&lt;dirname&gt;] [-clrQuota &lt;dirname&gt;...&lt;dirname&gt;] 
-					 [-restoreFailedStorage true|false|check] 
-					 [-help [cmd]]</code>
-				</p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			
-			           <tr>
-			          	<td><code>-report</code></td>
-			            <td>Reports basic filesystem information and statistics.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-safemode enter | leave | get | wait</code></td>
-			            <td>Safe mode maintenance command.
-                Safe mode is a Namenode state in which it <br/>
-                        1.  does not accept changes to the name space (read-only) <br/> 
-                        2.  does not replicate or delete blocks. <br/>
-                Safe mode is entered automatically at Namenode startup, and the
-                Namenode leaves safe mode automatically when the configured minimum
-                percentage of blocks satisfies the minimum replication
-                condition.  Safe mode can also be entered manually, but then
-                it can only be turned off manually as well.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-refreshNodes</code></td>
-			            <td>Re-read the hosts and exclude files to update the set
-                of Datanodes that are allowed to connect to the Namenode
-                and those that should be decommissioned or recommissioned.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-finalizeUpgrade</code></td>
-			            <td>Finalize upgrade of HDFS.
-                Datanodes delete their previous version working directories,
-                followed by Namenode doing the same.
-                This completes the upgrade process.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-printTopology</code></td>
-			            <td>Print a tree of the rack/datanode topology of the
-                 cluster as seen by the NameNode.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-upgradeProgress status | details | force</code></td>
-			            <td>Request current distributed upgrade status,
-                a detailed status or force the upgrade to proceed.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-metasave filename</code></td>
-			            <td>Save Namenode's primary data structures
-                to &lt;filename&gt; in the directory specified by hadoop.log.dir property.
-                &lt;filename&gt; will contain one line for each of the following <br/>
-                        1. Datanodes heart beating with Namenode<br/>
-                        2. Blocks waiting to be replicated<br/>
-                        3. Blocks currently being replicated<br/>
-                        4. Blocks waiting to be deleted</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-setQuota &lt;quota&gt; &lt;dirname&gt;...&lt;dirname&gt;</code></td>
-			            <td>Set the quota &lt;quota&gt; for each directory &lt;dirname&gt;.
-                The directory quota is a long integer that puts a hard limit on the number of names in the directory tree.<br/>
-                Best effort for the directory, with faults reported if<br/>
-                1. N is not a positive integer, or<br/>
-                2. user is not an administrator, or<br/>
-                3. the directory does not exist or is a file, or<br/>
-                4. the directory would immediately exceed the new quota.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-clrQuota &lt;dirname&gt;...&lt;dirname&gt;</code></td>
-			            <td>Clear the quota for each directory &lt;dirname&gt;.<br/>
-                Best effort for the directory, with faults reported if<br/>
-                1. the directory does not exist or is a file, or<br/>
-                2. user is not an administrator.<br/>
-                It does not fault if the directory has no quota.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-restoreFailedStorage true | false | check</code></td>
-			            <td>This option will turn on/off automatic attempt to restore failed storage replicas. 
-			            If a failed storage becomes available again the system will attempt to restore 
-			            edits and/or fsimage during checkpoint. 'check' option will return current setting.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-help [cmd]</code></td>
-			            <td> Displays help for the given command or all commands if none
-                is specified.</td>
-			           </tr>
-			     </table>
-			</section>
-			<section>
-        <title>mradmin</title>
-        <p>Runs MR admin client</p>
-        <p><code>Usage: hadoop mradmin  [</code>
-        <a href="commands_manual.html#Generic+Options">GENERIC_OPTIONS</a>
-        <code>] [-refreshServiceAcl] [-refreshQueues] [-refreshNodes] [-help [cmd]] </code></p>
-        <table>
-        <tr>
-        <th> COMMAND_OPTION </th><th> Description </th>
-        </tr>
-        <tr>
-        <td><code>-refreshServiceAcl</code></td>
-        <td> Reload the service-level authorization policies. Jobtracker
-         will reload the authorization policy file.</td>
-        </tr>
-        <tr>
-        <td><anchor id="RefreshQueues"/><code>-refreshQueues</code></td>
-        <td><p> Reload the queues' configuration at the JobTracker.
-          Most of the configuration of the queues can be refreshed/reloaded
-          without restarting the Map/Reduce sub-system. Administrators
-          typically own the
-          <a href="cluster_setup.html#mapred-queues.xml">
-          <em>conf/mapred-queues.xml</em></a>
-          file, can edit it while the JobTracker is still running, and can do
-          a reload by running this command.</p>
-          <p>It should be noted that while trying to refresh queues'
-          configuration, one cannot change the hierarchy of queues itself.
-          This means no operation that involves a change in either the
-          hierarchy structure itself or the queues' names will be allowed.
-          Only selected properties of queues can be changed during refresh.
-          For example, new queues cannot be added dynamically, neither can an
-          existing queue be deleted.</p>
-          <p>If during a reload of queue configuration,
-          a syntactic or semantic error is made during the editing of the
-          configuration file, the refresh command fails with an exception that
-          is printed on the standard output of this command, thus informing the
-          requester with any helpful messages of what has gone wrong during
-          the edit/reload. Importantly, the existing queue configuration is
-          untouched and the system is left in a consistent state.
-          </p>
-          <p>As described in the
-          <a href="cluster_setup.html#mapred-queues.xml"><em>
-          conf/mapred-queues.xml</em></a> section, the
-          <a href="cluster_setup.html#properties_tag"><em>
-          &lt;properties&gt;</em></a> tag in the queue configuration file can
-          also be used to specify per-queue properties needed by the scheduler.
-           When the framework's queue configuration is reloaded using this
-          command, this scheduler-specific configuration will also be reloaded,
-          provided the scheduler being configured supports this reload.
-          Please see the documentation of the particular scheduler in use.</p>
-          </td>
-        </tr>
-        <tr>
-        <td><code>-refreshNodes</code></td>
-        <td> Refresh the hosts information at the jobtracker.</td>
-        </tr>
-        <tr>
-        <td><code>-help [cmd]</code></td>
-        <td>Displays help for the given command or all commands if none
-                is specified.</td>
-        </tr>
-        </table>
-      </section>
-			<section>
-				<title> jobtracker </title>
-				<p>
-					Runs the MapReduce JobTracker node.
-				</p> 
-				<p>
-					<code>Usage: hadoop jobtracker [-dumpConfiguration]</code>
-					</p>
-          <table>
-          <tr>
-          <th>COMMAND_OPTION</th><th> Description</th>
-          </tr>
-          <tr>
-          <td><code>-dumpConfiguration</code></td>
-          <td> Dumps the configuration used by the JobTracker, along with the queue
-          configuration, in JSON format to standard output, and then exits.</td>
-          </tr>
-          </table>
-				
-			</section>
-			
-			<section>
-				<title> namenode </title>
-				<p>
-					Runs the namenode. For more information about upgrade, rollback and finalize see 
-					<a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Upgrade+and+Rollback">Upgrade and Rollback</a>.
-				</p>
-				<p>
-					<code>Usage: hadoop namenode [-format [-force] [-nonInteractive] [-clusterid someid]] | [-upgrade] | [-rollback] | [-finalize] | [-importCheckpoint] | [-checkpoint] | [-backup]</code>
-				</p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			
-                <tr>
-                  <td><code>-regular</code></td>
-                  <td>Start namenode in standard, active role rather than as backup or checkpoint node. This is the default role.</td>
-                </tr>
-                <tr>
-                  <td><code>-checkpoint</code></td>
-                  <td>Start namenode in checkpoint role, creating periodic checkpoints of the active namenode metadata.</td>
-                </tr>
-                <tr>
-                  <td><code>-backup</code></td>
-                  <td>Start namenode in backup role, maintaining an up-to-date in-memory copy of the namespace and creating periodic checkpoints.</td>
-                </tr>
-			           <tr>
-			          	<td><code>-format [-force] [-nonInteractive] [-clusterid someid]</code></td>
-			            <td>Formats the namenode. It starts the namenode, formats it and then shuts it down. User will be prompted before formatting any non empty name directories in the local filesystem.<br/>
-                                    -nonInteractive: User will not be prompted for input if non empty name directories exist in the local filesystem and the format will fail.<br/>
-                                    -force: Formats the namenode and the user will NOT be prompted to confirm formatting of the name directories in the local filesystem. If -nonInteractive option is specified it will be ignored.<br/>
-                                    -clusterid: Associates the namenode with the id specified. When formatting federated namenodes use this option to make sure all namenodes are associated with the same id.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-upgrade</code></td>
-			            <td>The Namenode should be started with the upgrade option after the distribution of a new Hadoop version.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-rollback</code></td>
-			            <td>Rolls back the namenode to the previous version. This should be used after stopping the cluster 
-			            and distributing the old Hadoop version.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-finalize</code></td>
-			            <td>Finalize will remove the previous state of the file system. The most recent upgrade will become permanent. 
-			            Rollback option will not be available anymore. After finalization it shuts the namenode down.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-importCheckpoint</code></td>
-			            <td>Loads image from a checkpoint directory and saves it into the current one. Checkpoint directory 
-			            is read from property dfs.namenode.checkpoint.dir
-			            (see <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Import+checkpoint">Import Checkpoint</a>).
-			            </td>
-			           </tr>
-			            <tr>
-			          	<td><code>-checkpoint</code></td>
-			            <td>Enables checkpointing 
-			            (see <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Checkpoint+Node">Checkpoint Node</a>).</td>
-			           </tr>
-			            <tr>
-			          	<td><code>-backup</code></td>
-			            <td>Enables checkpointing and maintains an in-memory, up-to-date copy of the file system namespace 
-			            (see <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Backup+Node">Backup Node</a>).</td>
-			           </tr>
-			     </table>
-			</section>
-			
-			<section>
-				<title> secondarynamenode </title>
-				<p>	
-					Runs the HDFS secondary 
-					namenode. See <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Secondary+NameNode">Secondary NameNode</a> 
-					for more info.
-				</p>
-				<p>
-					<code>Usage: hadoop secondarynamenode [-checkpoint [force]] | [-geteditsize]</code>
-				</p>
-				<table>
-			          <tr><th> COMMAND_OPTION </th><th> Description </th></tr>
-			
-			           <tr>
-			          	<td><code>-checkpoint [force]</code></td>
-			            <td>Checkpoints the Secondary namenode if EditLog size >= dfs.namenode.checkpoint.size. 
-			            If -force is used, checkpoint irrespective of EditLog size.</td>
-			           </tr>
-			           <tr>
-			          	<td><code>-geteditsize</code></td>
-			            <td>Prints the EditLog size.</td>
-			           </tr>
-			     </table>
-			</section>
-			
-			<section>
-				<title> tasktracker </title>
-				<p>
-					Runs a MapReduce task Tracker node.
-				</p> 
-				<p>
-					<code>Usage: hadoop tasktracker</code>
-				</p>
-			</section>
-			
-		</section>
-		
-		
-		      
-
-	</body>
-</document>      

+ 0 - 594
hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/file_system_shell.xml

@@ -1,594 +0,0 @@
-<?xml version="1.0"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
-<document>
-	<header>
-		<title>File System Shell Guide</title>
-	</header>
-	<body>
-		<section>
-			<title>Overview</title>
-			<p>
-      The File System (FS) shell includes various shell-like commands that directly
-      interact with the Hadoop Distributed File System (HDFS) as well as other file systems that Hadoop supports,  
-      such as Local FS, HFTP FS, S3 FS, and others. The FS shell is invoked by: </p>
-
-    <source>bin/hdfs dfs &lt;args&gt;</source>
-    
-      <p>
-      All FS shell commands take path URIs as arguments. The URI
-      format is <em>scheme://autority/path</em>. For HDFS the scheme
-      is <em>hdfs</em>, and for the Local FS the scheme
-      is <em>file</em>. The scheme and authority are optional. If not
-      specified, the default scheme specified in the configuration is
-      used. An HDFS file or directory such as <em>/parent/child</em>
-      can be specified as <em>hdfs://namenodehost/parent/child</em> or
-      simply as <em>/parent/child</em> (given that your configuration
-      is set to point to <em>hdfs://namenodehost</em>). 
-      </p>
-     <p>
-      Most of the commands in FS shell behave like corresponding Unix
-      commands. Differences are described with each of the
-      commands. Error information is sent to <em>stderr</em> and the
-      output is sent to <em>stdout</em>.
-  </p>
-  
-  
-<!-- CAT --> 
-		<section>
-			<title> cat </title>
-			<p>
-				<code>Usage: hdfs dfs -cat URI [URI &#x2026;]</code>
-			</p>
-			<p>
-		   Copies source paths to <em>stdout</em>. 
-		   </p>
-			<p>Example:</p>
-			<ul>
-				<li>
-					<code> hdfs dfs -cat hdfs://nn1.example.com/file1 hdfs://nn2.example.com/file2 
-		   </code>
-				</li>
-				<li>
-					<code>hdfs dfs -cat file:///file3 /user/hadoop/file4 </code>
-				</li>
-			</ul>
-			<p>Exit Code:<br/>
-		   <code> Returns 0 on success and -1 on error. </code></p>
-		</section>
-		
-		
-<!-- CHGRP --> 
-		<section>
-			<title> chgrp </title>
-			<p>
-				<code>Usage: hdfs dfs -chgrp [-R] GROUP URI [URI &#x2026;]</code>
-			</p>
-			<p>
-	    Change group association of files. With <code>-R</code>, make the change recursively through the directory structure. 
-	    The user must be the owner of files, or else a super-user. 
-	    Additional information is in the <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_permissions_guide.html">HDFS Permissions Guide</a>.
-	    </p>
-		</section>
-		<section>
-			<title> chmod </title>
-			<p>
-				<code>Usage: hdfs dfs -chmod [-R] &lt;MODE[,MODE]... | OCTALMODE&gt; URI [URI &#x2026;]</code>
-			</p>
-			<p>
-	    Change the permissions of files. With <code>-R</code>, make the change recursively through the directory structure. 
-	    The user must be the owner of the file, or else a super-user. 
-	    Additional information is in the <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_permissions_guide.html">HDFS Permissions Guide</a>.
-	    </p>
-		</section>
-		
-		
-<!-- CHOWN --> 		
-		<section>
-			<title> chown </title>
-			<p>
-				<code>Usage: hdfs dfs -chown [-R] [OWNER][:[GROUP]] URI [URI ]</code>
-			</p>
-			<p>
-	    Change the owner of files. With <code>-R</code>, make the change recursively through the directory structure. 
-	    The user must be a super-user. 
-	    Additional information is in the <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_permissions_guide.html">HDFS Permissions Guide</a>.
-	    </p>
-		</section>
-		
-		
-<!-- COPYFROMLOCAL --> 		
-		<section>
-			<title>copyFromLocal</title>
-			<p>
-				<code>Usage: hdfs dfs -copyFromLocal &lt;localsrc&gt; URI</code>
-			</p>
-			<p>Similar to <a href="#put"><strong>put</strong></a> command, except that the source is restricted to a local file reference. </p>
-		</section>
-		
-		
-<!-- COPYTOLOCAL -->
-		<section>
-			<title> copyToLocal</title>
-			<p>
-				<code>Usage: hdfs dfs -copyToLocal [-ignorecrc] [-crc] URI &lt;localdst&gt;</code>
-			</p>
-			<p> Similar to <a href="#get"><strong>get</strong></a> command, except that the destination is restricted to a local file reference.</p>
-		</section>
-		
-<!-- COUNT -->		
-		<section>
-			<title> count </title>
-			<p>
-				<code>Usage: hdfs dfs -count [-q]  &lt;paths&gt;</code>
-			</p>
-			<p>
-				Count the number of directories, files and bytes under the paths that match the specified file pattern. <br/><br/>
-				The output columns with <code>-count </code> are:<br/><br/>
-				<code>DIR_COUNT, FILE_COUNT, CONTENT_SIZE FILE_NAME</code> <br/><br/>
-				The output columns with <code>-count -q</code> are:<br/><br/>
-				<code>QUOTA, REMAINING_QUATA, SPACE_QUOTA, REMAINING_SPACE_QUOTA, 
-				DIR_COUNT, FILE_COUNT, CONTENT_SIZE, FILE_NAME</code>
-		   </p>
-			<p>Example:</p>
-			<ul>
-				<li>
-					<code> hdfs dfs -count hdfs://nn1.example.com/file1 hdfs://nn2.example.com/file2 
-		   </code>
-				</li>
-				<li>
-					<code> hdfs dfs -count -q hdfs://nn1.example.com/file1
-		   </code>
-				</li>
-			</ul>
-			<p>Exit Code:</p>
-			<p>
-				<code> Returns 0 on success and -1 on error.</code>
-			</p>
-		</section>
-		
-		
-<!-- CP -->		
-		<section>
-			<title> cp </title>
-			<p>
-				<code>Usage: hdfs dfs -cp URI [URI &#x2026;] &lt;dest&gt;</code>
-			</p>
-			<p>
-	    Copy files from source to destination. This command allows multiple sources as well in which case the destination must be a directory.
-	    <br/>
-	    Example:</p>
-			<ul>
-				<li>
-					<code> hdfs dfs -cp /user/hadoop/file1 /user/hadoop/file2</code>
-				</li>
-				<li>
-					<code> hdfs dfs -cp /user/hadoop/file1 /user/hadoop/file2 /user/hadoop/dir </code>
-				</li>
-			</ul>
-			<p>Exit Code:</p>
-			<p>
-				<code> Returns 0 on success and -1 on error.</code>
-			</p>
-		</section>
-		
-<!-- DU -->
-		<section>
-			<title>du</title>
-			<p>
-				<code>Usage: hdfs dfs -du [-s] [-h] URI [URI &#x2026;]</code>
-			</p>
-			<p>
-	     Displays sizes of files and directories contained in the given directory or the length of a file in case its just a file.</p>
-             <p>Options:</p>
-             <ul>
-             <li>The <code>-s</code> option will result in an aggregate summary of file lengths being displayed, rather than the individual files.</li>
-             <li>The <code>-h</code> option will format file sizes in a &quot;human-readable&quot; fashion (e.g 64.0m instead of 67108864)</li>
-             </ul>
-             <p>
-	     Example:<br/><code>hdfs dfs -du /user/hadoop/dir1 /user/hadoop/file1 hdfs://nn.example.com/user/hadoop/dir1</code><br/>
-	     Exit Code:<br/><code> Returns 0 on success and -1 on error. </code><br/></p>
-		</section>
-		
-<!-- DUS -->		
-		<section>
-			<title> dus </title>
-			<p>
-				<code>Usage: hdfs dfs -dus &lt;args&gt;</code>
-			</p>
-			<p>
-	    Displays a summary of file lengths. This is an alternate form of <code>hdfs dfs -du -s</code>.
-	   </p>
-		</section>
-		
-		
-<!-- EXPUNGE -->		
-		<section>
-			<title> expunge </title>
-			<p>
-				<code>Usage: hdfs dfs -expunge</code>
-			</p>
-			<p>Empty the Trash. Refer to the <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_design.html">HDFS Architecture Guide</a>
-			 for more information on the Trash feature.</p>
-		</section>
-
-
-<!-- GET -->			
-		<section>
-			<title> get </title>
-			<p>
-				<code>Usage: hdfs dfs -get [-ignorecrc] [-crc] &lt;src&gt; &lt;localdst&gt;</code>
-				<br/>
-			</p>
-			<p>
-	   Copy files to the local file system. Files that fail the CRC check may be copied with the  
-	   <code>-ignorecrc</code> option. Files and CRCs may be copied using the 
-	   <code>-crc</code> option.
-	  </p>
-			<p>Example:</p>
-			<ul>
-				<li>
-					<code> hdfs dfs -get /user/hadoop/file localfile </code>
-				</li>
-				<li>
-					<code> hdfs dfs -get hdfs://nn.example.com/user/hadoop/file localfile</code>
-				</li>
-			</ul>
-			<p>Exit Code:</p>
-			<p>
-				<code> Returns 0 on success and -1 on error. </code>
-			</p>
-		</section>
-		
-		
-<!-- GETMERGE -->			
-		<section>
-			<title> getmerge </title>
-			<p>
-				<code>Usage: hdfs dfs -getmerge [-nl] &lt;src&gt; &lt;localdst&gt;</code>
-			</p>
-			<p>
-	  Takes a source directory and a destination file as input and concatenates files in src into the destination local file. 
-	  Optionally <code>-nl</code> flag can be set to enable adding a newline character at the end of each file during merge.
-	  </p>
-		</section>
-		
-		
-<!-- LS -->		
-       <section>
-           <title>ls</title>
-           <p>
-               <code>Usage: hdfs dfs -ls [-d] [-h] [-R] &lt;args&gt;</code>
-           </p>
-           <p>For a file returns stat on the file with the following format:</p>
-           <p>
-               <code>permissions number_of_replicas userid  groupid  filesize modification_date modification_time filename</code>
-           </p>
-           <p>For a directory it returns list of its direct children as in unix.A directory is listed as:</p>
-           <p>
-               <code>permissions userid groupid modification_date modification_time dirname</code>
-           </p>
-           <p>Options:</p>
-             <ul>
-               <li><code>-d</code>  Directories are listed as plain files</li>
-               <li><code>-h</code>  Format file sizes in a &quot;human-readable&quot; fashion (e.g 64.0m instead of 67108864)</li>
-               <li><code>-R</code>  Recursively list subdirectories encountered</li>
-             </ul>
-           <p>Example:</p>
-           <p>
-               <code>hdfs dfs -ls /user/hadoop/file1 </code>
-           </p>
-           <p>Exit Code:</p>
-           <p>
-               <code>Returns 0 on success and -1 on error.</code>
-           </p>
-       </section>
-       
-       
-<!-- LSR -->       
-		<section>
-			<title>lsr</title>
-			<p><code>Usage: hdfs dfs -lsr &lt;args&gt;</code><br/>
-	      Recursive version of <code>ls</code>. Similar to Unix <code>ls -R</code>.
-	      </p>
-		</section>
-		
-		
-<!-- MKDIR -->  
-		<section>
-			<title> mkdir </title>
-			<p>
-				<code>Usage: hdfs dfs -mkdir &lt;paths&gt;</code>
-				<br/>
-			</p>
-			<p>
-	   Takes path uri's as argument and creates directories. The behavior is much like unix mkdir -p creating parent directories along the path.
-	  </p>
-			<p>Example:</p>
-			<ul>
-				<li>
-					<code>hdfs dfs -mkdir /user/hadoop/dir1 /user/hadoop/dir2 </code>
-				</li>
-				<li>
-					<code>hdfs dfs -mkdir hdfs://nn1.example.com/user/hadoop/dir hdfs://nn2.example.com/user/hadoop/dir
-	  </code>
-				</li>
-			</ul>
-			<p>Exit Code:</p>
-			<p>
-				<code>Returns 0 on success and -1 on error.</code>
-			</p>
-		</section>
-		
-		
-<!-- MOVEFROMLOCAL -->  
-		<section>
-			<title> moveFromLocal </title>
-			<p>
-				<code>Usage: dfs -moveFromLocal &lt;localsrc&gt; &lt;dst&gt;</code>
-			</p>
-			<p>Similar to <a href="#put"><strong>put</strong></a> command, except that the source <code>localsrc</code> is deleted after it's copied. </p>
-		</section>
-		
-		
-<!-- MOVETOLOCAL -->  
-		<section>
-			<title> moveToLocal</title>
-			<p>
-				<code>Usage: hdfs dfs -moveToLocal [-crc] &lt;src&gt; &lt;dst&gt;</code>
-			</p>
-			<p>Displays a "Not implemented yet" message.</p>
-		</section>
-		
-		
-<!-- MV -->  
-		<section>
-			<title> mv </title>
-			<p>
-				<code>Usage: hdfs dfs -mv URI [URI &#x2026;] &lt;dest&gt;</code>
-			</p>
-			<p>
-	    Moves files from source to destination. This command allows multiple sources as well in which case the destination needs to be a directory. 
-	    Moving files across file systems is not permitted.
-	    <br/>
-	    Example:
-	    </p>
-			<ul>
-				<li>
-					<code> hdfs dfs -mv /user/hadoop/file1 /user/hadoop/file2</code>
-				</li>
-				<li>
-					<code> hdfs dfs -mv hdfs://nn.example.com/file1 hdfs://nn.example.com/file2 hdfs://nn.example.com/file3 hdfs://nn.example.com/dir1</code>
-				</li>
-			</ul>
-			<p>Exit Code:</p>
-			<p>
-				<code> Returns 0 on success and -1 on error.</code>
-			</p>
-		</section>
-		
-		
-<!-- PUT --> 
-		<section>
-			<title> put </title>
-			<p>
-				<code>Usage: hdfs dfs -put &lt;localsrc&gt; ... &lt;dst&gt;</code>
-			</p>
-			<p>Copy single src, or multiple srcs from local file system to the destination file system. 
-			Also reads input from stdin and writes to destination file system.<br/>
-	   </p>
-			<ul>
-				<li>
-					<code> hdfs dfs -put localfile /user/hadoop/hadoopfile</code>
-				</li>
-				<li>
-					<code> hdfs dfs -put localfile1 localfile2 /user/hadoop/hadoopdir</code>
-				</li>
-				<li>
-					<code> hdfs dfs -put localfile hdfs://nn.example.com/hadoop/hadoopfile</code>
-				</li>
-				<li><code>hdfs dfs -put - hdfs://nn.example.com/hadoop/hadoopfile</code><br/>Reads the input from stdin.</li>
-			</ul>
-			<p>Exit Code:</p>
-			<p>
-				<code> Returns 0 on success and -1 on error. </code>
-			</p>
-		</section>
-		
-		
-<!-- RM --> 
-		<section>
-			<title> rm </title>
-			<p>
-				<code>Usage: hdfs dfs -rm [-skipTrash] URI [URI &#x2026;] </code>
-			</p>
-			<p>
-	   Delete files specified as args. Only deletes files. If the <code>-skipTrash</code> option
-	   is specified, the trash, if enabled, will be bypassed and the specified file(s) deleted immediately.  	This can be
-		   useful when it is necessary to delete files from an over-quota directory.
-	   Use -rm -r or rmr for recursive deletes.<br/>
-	   Example:
-	   </p>
-			<ul>
-				<li>
-					<code> hdfs dfs -rm hdfs://nn.example.com/file </code>
-				</li>
-			</ul>
-			<p>Exit Code:</p>
-			<p>
-				<code> Returns 0 on success and -1 on error.</code>
-			</p>
-		</section>
-		
-		
-<!-- RMR --> 
-		<section>
-			<title> rmr </title>
-			<p>
-				<code>Usage: hdfs dfs -rmr [-skipTrash] URI [URI &#x2026;]</code>
-			</p>
-			<p>Recursive version of delete. The rmr command recursively deletes the directory and any content under it. If the <code>-skipTrash</code> option
-		   is specified, the trash, if enabled, will be bypassed and the specified file(s) deleted immediately. This can be
-		   useful when it is necessary to delete files from an over-quota directory.<br/>
-	   Example:
-	   </p>
-			<ul>
-				<li>
-					<code> hdfs dfs -rmr /user/hadoop/dir </code>
-				</li>
-				<li>
-					<code> hdfs dfs -rmr hdfs://nn.example.com/user/hadoop/dir </code>
-				</li>
-			</ul>
-			<p>Exit Code:</p>
-			<p>
-				<code> Returns 0 on success and -1 on error. </code>
-			</p>
-		</section>
-		
-		
-<!-- SETREP --> 
-		<section>
-			<title> setrep </title>
-			<p>
-				<code>Usage: hdfs dfs -setrep [-R] &lt;path&gt;</code>
-			</p>
-			<p>
-	   Changes the replication factor of a file. -R option is for recursively increasing the replication factor of files within a directory.
-	  </p>
-			<p>Example:</p>
-			<ul>
-				<li>
-					<code> hdfs dfs -setrep -w 3 -R /user/hadoop/dir1 </code>
-				</li>
-			</ul>
-			<p>Exit Code:</p>
-			<p>
-				<code>Returns 0 on success and -1 on error. </code>
-			</p>
-		</section>
-		
-		
-<!-- STAT --> 
-		<section>
-			<title> stat </title>
-			<p>
-				<code>Usage: hdfs dfs -stat [format] URI [URI &#x2026;]</code>
-			</p>
-			<p>Print statistics about the file/directory matching the given URI pattern in the specified format.</p>
-			<p>Format accepts:</p>
-			  <ul>
-			    <li>filesize in blocks (%b)</li>
-			    <li>filename (%n)</li>
-		      <li>block size (%o)</li>
-		      <li>replication (%r)</li>
-		      <li>modification date, formatted as Y-M-D H:M:S (%y)</li>
-		      <li>modification date, in epoch seconds (%Y)</li>
-        </ul>
-			<p>Example:</p>
-			<ul>
-        <li>
-          <code> hdfs dfs -stat path </code>
-        </li>
-				<li>
-					<code> hdfs dfs -stat %y path </code>
-				</li>
-        <li>
-          <code> hdfs dfs -stat '%b %r' path </code>
-        </li>
-			</ul>
-			<p>Exit Code:<br/>
-	   <code> Returns 0 on success and -1 on error.</code></p>
-		</section>
-		
-		
-<!-- TAIL--> 
-		<section>
-			<title> tail </title>
-			<p>
-				<code>Usage: hdfs dfs -tail [-f] URI </code>
-			</p>
-			<p>
-	   Displays last kilobyte of the file to stdout. -f option can be used as in Unix.
-	   </p>
-			<p>Example:</p>
-			<ul>
-				<li>
-					<code> hdfs dfs -tail pathname </code>
-				</li>
-			</ul>
-			<p>Exit Code: <br/>
-	   <code> Returns 0 on success and -1 on error.</code></p>
-		</section>
-		
-		
-<!-- TEST --> 
-		<section>
-			<title> test </title>
-			<p>
-				<code>Usage: hdfs dfs -test -[ezd] URI</code>
-			</p>
-			<p>
-	   Options: <br/>
-	   -e check to see if the file exists. Return 0 if true. <br/>
-	   -z check to see if the file is zero length. Return 0 if true. <br/>
-	   -d check to see if the path is directory. Return 0 if true. <br/></p>
-			<p>Example:</p>
-			<ul>
-				<li>
-					<code> hdfs dfs -test -e filename </code>
-				</li>
-			</ul>
-		</section>
-		
-		
-<!-- TEXT --> 
-		<section>
-			<title> text </title>
-			<p>
-				<code>Usage: hdfs dfs -text &lt;src&gt;</code>
-				<br/>
-			</p>
-			<p>
-	   Takes a source file and outputs the file in text format. The allowed formats are zip and TextRecordInputStream.
-	  </p>
-		</section>
-		
-		
-<!-- TOUCHZ --> 
-		<section>
-			<title> touchz </title>
-			<p>
-				<code>Usage: hdfs dfs -touchz URI [URI &#x2026;]</code>
-				<br/>
-			</p>
-			<p>
-	   Create a file of zero length.
-	   </p>
-			<p>Example:</p>
-			<ul>
-				<li>
-					<code> hadoop -touchz pathname </code>
-				</li>
-			</ul>
-			<p>Exit Code:<br/>
-	   <code> Returns 0 on success and -1 on error.</code></p>
-		</section>
-        </section>
-	</body>
-</document>

File diff suppressed because it is too large
+ 371 - 383
hadoop-common-project/hadoop-common/src/site/apt/ClusterSetup.apt.vm


+ 490 - 0
hadoop-common-project/hadoop-common/src/site/apt/CommandsManual.apt.vm

@@ -0,0 +1,490 @@
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements.  See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License.  You may obtain a copy of the License at
+~~
+~~     http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+  ---
+  Hadoop Commands Guide
+  ---
+  ---
+  ${maven.build.timestamp}
+
+%{toc}
+
+Overview
+
+   All hadoop commands are invoked by the <<<bin/hadoop>>> script. Running the
+   hadoop script without any arguments prints the description for all
+   commands.
+
+   Usage: <<<hadoop [--config confdir] [COMMAND] [GENERIC_OPTIONS] [COMMAND_OPTIONS]>>>
+
+   Hadoop has an option parsing framework that supports parsing generic
+   options as well as running classes.
+
+*-----------------------+---------------+
+|| COMMAND_OPTION       || Description
+*-----------------------+---------------+
+| <<<--config confdir>>>| Overwrites the default Configuration directory.  Default is <<<${HADOOP_HOME}/conf>>>.
+*-----------------------+---------------+
+| GENERIC_OPTIONS       | The common set of options supported by multiple commands.
+*-----------------------+---------------+
+| COMMAND_OPTIONS       | Various commands with their options are described in the following sections. The commands have been grouped into User Commands and Administration Commands.
+*-----------------------+---------------+
+
+Generic Options
+
+   The following options are supported by {{dfsadmin}}, {{fs}}, {{fsck}},
+   {{job}} and {{fetchdt}}. Applications should implement {{{some_useful_url}Tool}} to support
+   {{{another_useful_url}GenericOptions}}.
+
+*------------------------------------------------+-----------------------------+
+||            GENERIC_OPTION                     ||            Description
+*------------------------------------------------+-----------------------------+
+|<<<-conf \<configuration file\> >>>             | Specify an application
+                                                 | configuration file.
+*------------------------------------------------+-----------------------------+
+|<<<-D \<property\>=\<value\> >>>                | Use value for given property.
+*------------------------------------------------+-----------------------------+
+|<<<-jt \<local\> or \<jobtracker:port\> >>>     | Specify a job tracker.
+                                                 | Applies only to job.
+*------------------------------------------------+-----------------------------+
+|<<<-files \<comma separated list of files\> >>> | Specify comma separated files
+                                                 | to be copied to the map
+                                                 | reduce cluster.  Applies only
+                                                 | to job.
+*------------------------------------------------+-----------------------------+
+|<<<-libjars \<comma separated list of jars\> >>>| Specify comma separated jar
+                                                 | files to include in the
+                                                 | classpath. Applies only to
+                                                 | job.
+*------------------------------------------------+-----------------------------+
+|<<<-archives \<comma separated list of archives\> >>> | Specify comma separated
+                                                 | archives to be unarchived on
+                                                 | the compute machines. Applies
+                                                 | only to job.
+*------------------------------------------------+-----------------------------+
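+
+   For illustration only, generic options are placed before the
+   command-specific options. The configuration file name, property and paths
+   below are hypothetical:
+
++---
+hadoop fs -conf my-cluster.xml -ls /user/hadoop
+hadoop fs -D dfs.replication=2 -put localfile /user/hadoop/
++---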
+
+User Commands
+
+   Commands useful for users of a hadoop cluster.
+
+* <<<archive>>>
+
+   Creates a hadoop archive. More information can be found at Hadoop
+   Archives.
+
+   Usage: <<<hadoop archive -archiveName NAME <src>* <dest> >>>
+
+*-------------------+-------------------------------------------------------+
+||COMMAND_OPTION    ||                   Description
+*-------------------+-------------------------------------------------------+
+| -archiveName NAME |  Name of the archive to be created.
+*-------------------+-------------------------------------------------------+
+| src               | Filesystem pathnames which work as usual with regular
+                    | expressions.
+*-------------------+-------------------------------------------------------+
+| dest              | Destination directory which would contain the archive.
+*-------------------+-------------------------------------------------------+
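+
+   For illustration, a possible invocation following the usage above (the
+   archive name and paths are hypothetical):
+
++---
+hadoop archive -archiveName data.har /user/hadoop/dir1 /user/hadoop/dir2 /user/hadoop/archives
++---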
+
+* <<<distcp>>>
+
+   Copies files or directories recursively. More information can be found at
+   Hadoop DistCp Guide.
+
+   Usage: <<<hadoop distcp <srcurl> <desturl> >>>
+
+*-------------------+--------------------------------------------+
+||COMMAND_OPTION    || Description
+*-------------------+--------------------------------------------+
+| srcurl            | Source Url
+*-------------------+--------------------------------------------+
+| desturl           | Destination Url
+*-------------------+--------------------------------------------+
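+
+   For illustration, a possible invocation (hostnames and paths are
+   hypothetical):
+
++---
+hadoop distcp hdfs://nn1.example.com/user/hadoop/input hdfs://nn2.example.com/user/hadoop/input-copy
++---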
+
+* <<<fs>>>
+
+   Usage: <<<hadoop fs [GENERIC_OPTIONS] [COMMAND_OPTIONS]>>>
+
+   Deprecated, use <<<hdfs dfs>>> instead.
+
+   Runs a generic filesystem user client.
+
+   The various COMMAND_OPTIONS can be found at File System Shell Guide.
+
+* <<<fsck>>>
+
+   Runs a HDFS filesystem checking utility. See {{Fsck}} for more info.
+
+   Usage: <<<hadoop fsck [GENERIC_OPTIONS] <path> [-move | -delete | -openforwrite] [-files [-blocks [-locations | -racks]]]>>>
+
+*------------------+---------------------------------------------+
+||  COMMAND_OPTION || Description
+*------------------+---------------------------------------------+
+|   <path>         | Start checking from this path.
+*------------------+---------------------------------------------+
+|   -move          | Move corrupted files to /lost+found
+*------------------+---------------------------------------------+
+|   -delete        | Delete corrupted files.
+*------------------+---------------------------------------------+
+|   -openforwrite  | Print out files opened for write.
+*------------------+---------------------------------------------+
+|   -files         | Print out files being checked.
+*------------------+---------------------------------------------+
+|   -blocks        | Print out block report.
+*------------------+---------------------------------------------+
+|   -locations     | Print out locations for every block.
+*------------------+---------------------------------------------+
+|   -racks         | Print out network topology for data-node locations.
+*------------------+---------------------------------------------+
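+
+   For illustration, a possible invocation that checks a directory and prints
+   the files, blocks and locations being checked (the path is hypothetical):
+
++---
+hadoop fsck /user/hadoop -files -blocks -locations
++---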
+
+* <<<fetchdt>>>
+
+   Gets Delegation Token from a NameNode. See {{fetchdt}} for more info.
+
+   Usage: <<<hadoop fetchdt [GENERIC_OPTIONS] [--webservice <namenode_http_addr>] <path> >>>
+
+*------------------------------+---------------------------------------------+
+|| COMMAND_OPTION              || Description
+*------------------------------+---------------------------------------------+
+| <fileName>                   | File name to store the token into.
+*------------------------------+---------------------------------------------+
+| --webservice <https_address> | Use the HTTP protocol instead of RPC.
+*------------------------------+---------------------------------------------+
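+
+   For illustration, a possible invocation (the NameNode HTTP address and the
+   token file path are hypothetical):
+
++---
+hadoop fetchdt --webservice http://nn.example.com:50070 /tmp/my.delegation.token
++---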
+
+* <<<jar>>>
+
+   Runs a jar file. Users can bundle their Map Reduce code in a jar file and
+   execute it using this command.
+
+   Usage: <<<hadoop jar <jar> [mainClass] args...>>>
+
+   Streaming jobs are run via this command; examples can be found in the
+   Streaming examples.
+
+   The word count example is also run using the jar command; it can be found
+   in the Wordcount example.
+
+* <<<job>>>
+
+   Command to interact with Map Reduce Jobs.
+
+   Usage: <<<hadoop job [GENERIC_OPTIONS] [-submit <job-file>] | [-status <job-id>] | [-counter <job-id> <group-name> <counter-name>] | [-kill <job-id>] | [-events <job-id> <from-event-#> <#-of-events>] | [-history [all] <jobOutputDir>] | [-list [all]] | [-kill-task <task-id>] | [-fail-task <task-id>] | [-set-priority <job-id> <priority>]>>>
+
+*------------------------------+---------------------------------------------+
+|| COMMAND_OPTION              || Description
+*------------------------------+---------------------------------------------+
+| -submit <job-file>           | Submits the job.
+*------------------------------+---------------------------------------------+
+| -status <job-id>             | Prints the map and reduce completion
+                               | percentage and all job counters.
+*------------------------------+---------------------------------------------+
+| -counter <job-id> <group-name> <counter-name> | Prints the counter value.
+*------------------------------+---------------------------------------------+
+| -kill <job-id>               | Kills the job.
+*------------------------------+---------------------------------------------+
+| -events <job-id> <from-event-#> <#-of-events> | Prints the events' details
+                               | received by jobtracker for the given range.
+*------------------------------+---------------------------------------------+
+| -history [all]<jobOutputDir> | Prints job details, failed and killed tip
+                               | details.  More details about the job such as
+                               | successful tasks and task attempts made for
+                               | each task can be viewed by specifying the [all]
+                               | option.
+*------------------------------+---------------------------------------------+
+| -list [all]                  | Displays jobs which are yet to complete.
+                               | <<<-list all>>> displays all jobs.
+*------------------------------+---------------------------------------------+
+| -kill-task <task-id>         | Kills the task. Killed tasks are NOT counted
+                               | against failed attempts.
+*------------------------------+---------------------------------------------+
+| -fail-task <task-id>         | Fails the task. Failed tasks are counted
+                               | against failed attempts.
+*------------------------------+---------------------------------------------+
+| -set-priority <job-id> <priority> | Changes the priority of the job. Allowed
+                               | priority values are VERY_HIGH, HIGH, NORMAL,
+                               | LOW, VERY_LOW
+*------------------------------+---------------------------------------------+
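+
+   For illustration, possible invocations (the job id is hypothetical):
+
++---
+hadoop job -list all
+hadoop job -status job_201212041426_0001
++---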
+
+* <<<pipes>>>
+
+   Runs a pipes job.
+
+   Usage: <<<hadoop pipes [-conf <path>] [-jobconf <key=value>, <key=value>,
+   ...] [-input <path>] [-output <path>] [-jar <jar file>] [-inputformat
+   <class>] [-map <class>] [-partitioner <class>] [-reduce <class>] [-writer
+   <class>] [-program <executable>] [-reduces <num>]>>>
+ 
+*----------------------------------------+------------------------------------+
+|| COMMAND_OPTION                        || Description
+*----------------------------------------+------------------------------------+
+| -conf <path>                           | Configuration for job
+*----------------------------------------+------------------------------------+
+| -jobconf <key=value>, <key=value>, ... | Add/override configuration for job
+*----------------------------------------+------------------------------------+
+| -input <path>                          | Input directory
+*----------------------------------------+------------------------------------+
+| -output <path>                         | Output directory
+*----------------------------------------+------------------------------------+
+| -jar <jar file>                        | Jar filename
+*----------------------------------------+------------------------------------+
+| -inputformat <class>                   | InputFormat class
+*----------------------------------------+------------------------------------+
+| -map <class>                           | Java Map class
+*----------------------------------------+------------------------------------+
+| -partitioner <class>                   | Java Partitioner
+*----------------------------------------+------------------------------------+
+| -reduce <class>                        | Java Reduce class
+*----------------------------------------+------------------------------------+
+| -writer <class>                        | Java RecordWriter
+*----------------------------------------+------------------------------------+
+| -program <executable>                  | Executable URI
+*----------------------------------------+------------------------------------+
+| -reduces <num>                         | Number of reduces
+*----------------------------------------+------------------------------------+
+
+* <<<queue>>>
+
+   Command to interact with and view Job Queue information.
+
+   Usage: <<<hadoop queue [-list] | [-info <job-queue-name> [-showJobs]] | [-showacls]>>>
+
+*-----------------+-----------------------------------------------------------+
+|| COMMAND_OPTION || Description
+*-----------------+-----------------------------------------------------------+
+| -list           | Gets the list of Job Queues configured in the system,
+                  | along with the scheduling information associated with them.
+*-----------------+-----------------------------------------------------------+
+| -info <job-queue-name> [-showJobs] | Displays the job queue information and
+                  | associated scheduling information of the particular job queue.
+                  | If the <<<-showJobs>>> option is present, a list of jobs
+                  | submitted to the particular job queue is displayed.
+*-----------------+-----------------------------------------------------------+
+| -showacls       | Displays the queue name and associated queue operations
+                  | allowed for the current user. The list consists of only
+                  | those queues to which the user has access.
+*-----------------+-----------------------------------------------------------+
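+
+   For illustration, possible invocations (the queue name is hypothetical):
+
++---
+hadoop queue -list
+hadoop queue -info default -showJobs
++---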
+
+* <<<version>>>
+
+   Prints the version.
+
+   Usage: <<<hadoop version>>>
+
+* <<<CLASSNAME>>>
+
+   The hadoop script can be used to invoke any class.
+
+   Usage: <<<hadoop CLASSNAME>>>
+
+   Runs the class named <<<CLASSNAME>>>.
+
+* <<<classpath>>>
+
+   Prints the class path needed to get the Hadoop jar and the required
+   libraries.
+
+   Usage: <<<hadoop classpath>>>
+
+Administration Commands
+
+   Commands useful for administrators of a hadoop cluster.
+
+* <<<balancer>>>
+
+   Runs a cluster balancing utility. An administrator can simply press Ctrl-C
+   to stop the rebalancing process. See Rebalancer for more details.
+
+   Usage: <<<hadoop balancer [-threshold <threshold>]>>>
+
+*------------------------+-----------------------------------------------------------+
+|| COMMAND_OPTION        || Description
+*------------------------+-----------------------------------------------------------+
+| -threshold <threshold> | Percentage of disk capacity. This overwrites the
+                         | default threshold.
+*------------------------+-----------------------------------------------------------+
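+
+   For illustration, a possible invocation that uses a 5 percent threshold
+   instead of the default:
+
++---
+hadoop balancer -threshold 5
++---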
+
+* <<<daemonlog>>>
+
+   Get/Set the log level for each daemon.
+
+   Usage: <<<hadoop daemonlog -getlevel <host:port> <name> >>>
+   Usage: <<<hadoop daemonlog -setlevel <host:port> <name> <level> >>>
+
+*------------------------------+-----------------------------------------------------------+
+|| COMMAND_OPTION              || Description
+*------------------------------+-----------------------------------------------------------+
+| -getlevel <host:port> <name> | Prints the log level of the daemon running at
+                               | <host:port>. This command internally connects
+                               | to http://<host:port>/logLevel?log=<name>
+*------------------------------+-----------------------------------------------------------+
+|   -setlevel <host:port> <name> <level> | Sets the log level of the daemon
+                               | running at <host:port>. This command internally
+                               | connects to http://<host:port>/logLevel?log=<name>
+*------------------------------+-----------------------------------------------------------+
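+
+   For illustration, possible invocations (the host, port and logger name are
+   hypothetical):
+
++---
+hadoop daemonlog -getlevel nn.example.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode
+hadoop daemonlog -setlevel nn.example.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode DEBUG
++---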
+
+* <<<datanode>>>
+
+   Runs a HDFS datanode.
+
+   Usage: <<<hadoop datanode [-rollback]>>>
+
+*-----------------+-----------------------------------------------------------+
+|| COMMAND_OPTION || Description
+*-----------------+-----------------------------------------------------------+
+| -rollback       | Rolls back the datanode to the previous version. This should
+                  | be used after stopping the datanode and distributing the old
+                  | hadoop version.
+*-----------------+-----------------------------------------------------------+
+
+* <<<dfsadmin>>>
+
+   Runs a HDFS dfsadmin client.
+
+   Usage: <<<hadoop dfsadmin [GENERIC_OPTIONS] [-report] [-safemode enter | leave | get | wait] [-refreshNodes] [-finalizeUpgrade] [-upgradeProgress status | details | force] [-metasave filename] [-setQuota <quota> <dirname>...<dirname>] [-clrQuota <dirname>...<dirname>] [-help [cmd]]>>>
+
+*-----------------+-----------------------------------------------------------+
+|| COMMAND_OPTION || Description
+*-----------------+-----------------------------------------------------------+
+| -report         | Reports basic filesystem information and statistics.
+*-----------------+-----------------------------------------------------------+
+| -safemode enter / leave / get / wait | Safe mode maintenance command. Safe
+                  | mode is a Namenode state in which it \
+                  | 1. does not accept changes to the name space (read-only) \
+                  | 2. does not replicate or delete blocks. \
+                  | Safe mode is entered automatically at Namenode startup, and
+                  | leaves safe mode automatically when the configured minimum
+                  | percentage of blocks satisfies the minimum replication
+                  | condition. Safe mode can also be entered manually, but then
+                  | it can only be turned off manually as well.
+*-----------------+-----------------------------------------------------------+
+| -refreshNodes   | Re-read the hosts and exclude files to update the set of
+                  | Datanodes that are allowed to connect to the Namenode and
+                  | those that should be decommissioned or recommissioned.
+*-----------------+-----------------------------------------------------------+
+| -finalizeUpgrade| Finalize upgrade of HDFS. Datanodes delete their previous
+                  | version working directories, followed by Namenode doing the
+                  | same. This completes the upgrade process.
+*-----------------+-----------------------------------------------------------+
+| -upgradeProgress status / details / force | Request current distributed
+                  | upgrade status, a detailed status or force the upgrade to
+                  | proceed.
+*-----------------+-----------------------------------------------------------+
+| -metasave filename | Save Namenode's primary data structures to <filename> in
+                  | the directory specified by hadoop.log.dir property.
+                  | <filename> will contain one line for each of the following\
+                  | 1. Datanodes heart beating with Namenode\
+                  | 2. Blocks waiting to be replicated\
+                  | 3. Blocks currently being replicated\
+                  | 4. Blocks waiting to be deleted\
+*-----------------+-----------------------------------------------------------+
+| -setQuota <quota> <dirname>...<dirname> | Set the quota <quota> for each
+                  | directory <dirname>. The directory quota is a long integer
+                  | that puts a hard limit on the number of names in the
+                  | directory tree.  Best effort for the directory, with faults
+                  | reported if \
+                  | 1. N is not a positive integer, or \
+                  | 2. user is not an administrator, or \
+                  | 3. the directory does not exist or is a file, or \
+                  | 4. the directory would immediately exceed the new quota. \
+*-----------------+-----------------------------------------------------------+
+| -clrQuota <dirname>...<dirname> | Clear the quota for each directory
+                  | <dirname>.  Best effort for the directory. with fault
+                  | <dirname>.  Best effort for the directory, with faults
+                  | 1. the directory does not exist or is a file, or \
+                  | 2. user is not an administrator.  It does not fault if the
+                  | directory has no quota.
+*-----------------+-----------------------------------------------------------+
+| -help [cmd]     | Displays help for the given command or all commands if none
+                  | is specified.
+*-----------------+-----------------------------------------------------------+
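+
+   For illustration, possible invocations (the quota value and directory are
+   hypothetical):
+
++---
+hadoop dfsadmin -report
+hadoop dfsadmin -safemode get
+hadoop dfsadmin -setQuota 100 /user/hadoop/quota-dir
++---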
+
+* <<<mradmin>>>
+
+   Runs the MR admin client.
+
+   Usage: <<<hadoop mradmin [ GENERIC_OPTIONS ] [-refreshQueueAcls]>>>
+
+*-------------------+-----------------------------------------------------------+
+|| COMMAND_OPTION   || Description
+*-------------------+-----------------------------------------------------------+
+| -refreshQueueAcls | Refresh the queue acls used by hadoop, to check access
+                    | during submissions and administration of the job by the
+                    | user. The properties present in mapred-queue-acls.xml are
+                    | reloaded by the queue manager.
+*-------------------+-----------------------------------------------------------+
+
+* <<<jobtracker>>>
+
+   Runs the MapReduce job Tracker node.
+
+   Usage: <<<hadoop jobtracker [-dumpConfiguration]>>>
+
+*--------------------+-----------------------------------------------------------+
+|| COMMAND_OPTION    || Description
+*--------------------+-----------------------------------------------------------+
+| -dumpConfiguration | Dumps the configuration used by the JobTracker, along
+                     | with the queue configuration, in JSON format to standard
+                     | output, and then exits.
+*--------------------+-----------------------------------------------------------+
+
+* <<<namenode>>>
+
+   Runs the namenode. More information about upgrade, rollback and finalize
+   is at Upgrade Rollback.
+
+   Usage: <<<hadoop namenode [-format] | [-upgrade] | [-rollback] | [-finalize] | [-importCheckpoint]>>>
+
+*--------------------+-----------------------------------------------------------+
+|| COMMAND_OPTION    || Description
+*--------------------+-----------------------------------------------------------+
+| -format            | Formats the namenode. It starts the namenode, formats
+                     | it and then shuts it down.
+*--------------------+-----------------------------------------------------------+
+| -upgrade           | Namenode should be started with upgrade option after
+                     | the distribution of new hadoop version.
+*--------------------+-----------------------------------------------------------+
+| -rollback          | Rolls back the namenode to the previous version. This
+                     | should be used after stopping the cluster and
+                     | distributing the old hadoop version.
+*--------------------+-----------------------------------------------------------+
+| -finalize          | Finalize will remove the previous state of the file
+                     | system. The recent upgrade will become permanent. Rollback
+                     | option will not be available anymore. After finalization
+                     | it shuts the namenode down.
+*--------------------+-----------------------------------------------------------+
+| -importCheckpoint  | Loads image from a checkpoint directory and saves it
+                     | into the current one. The checkpoint directory is read
+                     | from the property dfs.namenode.checkpoint.dir
+*--------------------+-----------------------------------------------------------+
+
+* <<<secondarynamenode>>>
+
+   Runs the HDFS secondary namenode. See Secondary Namenode for more
+   info.
+
+   Usage: <<<hadoop secondarynamenode [-checkpoint [force]] | [-geteditsize]>>>
+
+*----------------------+-----------------------------------------------------------+
+|| COMMAND_OPTION      || Description
+*----------------------+-----------------------------------------------------------+
+| -checkpoint [-force] | Checkpoints the Secondary namenode if EditLog size
+                       | >= dfs.namenode.checkpoint.size. If <<<-force>>> is used,
+                       | checkpoint irrespective of EditLog size.
+*----------------------+-----------------------------------------------------------+
+| -geteditsize         | Prints the EditLog size.
+*----------------------+-----------------------------------------------------------+
+
+* <<<tasktracker>>>
+
+   Runs a MapReduce task Tracker node.
+
+   Usage: <<<hadoop tasktracker>>>

+ 418 - 0
hadoop-common-project/hadoop-common/src/site/apt/FileSystemShell.apt.vm

@@ -0,0 +1,418 @@
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements.  See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License.  You may obtain a copy of the License at
+~~
+~~     http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+  ---
+  File System Shell Guide
+  ---
+  ---
+  ${maven.build.timestamp}
+
+%{toc}
+
+Overview
+
+   The File System (FS) shell includes various shell-like commands that
+   directly interact with the Hadoop Distributed File System (HDFS) as well as
+   other file systems that Hadoop supports, such as Local FS, HFTP FS, S3 FS,
+   and others. The FS shell is invoked by:
+
++---
+bin/hadoop fs <args>
++---
+
+   All FS shell commands take path URIs as arguments. The URI format is
+   <<<scheme://authority/path>>>. For HDFS the scheme is <<<hdfs>>>, and for
+   the Local FS the scheme is <<<file>>>. The scheme and authority are
+   optional. If not specified, the default scheme specified in the
+   configuration is used. An HDFS file or directory such as /parent/child can
+   be specified as <<<hdfs://namenodehost/parent/child>>> or simply as
+   <<</parent/child>>> (given that your configuration is set to point to
+   <<<hdfs://namenodehost>>>).
+
+   Most of the commands in FS shell behave like corresponding Unix commands.
+   Differences are described with each of the commands. Error information is
+   sent to stderr and the output is sent to stdout.
+
+cat
+
+   Usage: <<<hdfs dfs -cat URI [URI ...]>>>
+
+   Copies source paths to stdout.
+
+   Example:
+
+     * <<<hdfs dfs -cat hdfs://nn1.example.com/file1 hdfs://nn2.example.com/file2>>>
+
+     * <<<hdfs dfs -cat file:///file3 /user/hadoop/file4>>>
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+chgrp
+
+   Usage: <<<hdfs dfs -chgrp [-R] GROUP URI [URI ...]>>>
+
+   Change group association of files. With -R, make the change recursively
+   through the directory structure. The user must be the owner of files, or
+   else a super-user. Additional information is in the
+   {{{betterurl}Permissions Guide}}.
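+
+   For illustration, a possible invocation (the group name and path are
+   hypothetical):
+
+     * <<<hdfs dfs -chgrp -R hadoopusers /user/hadoop/dir1>>>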
+
+chmod
+
+   Usage: <<<hdfs dfs -chmod [-R] <MODE[,MODE]... | OCTALMODE> URI [URI ...]>>>
+
+   Change the permissions of files. With -R, make the change recursively
+   through the directory structure. The user must be the owner of the file, or
+   else a super-user. Additional information is in the 
+   {{{betterurl}Permissions Guide}}.
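+
+   For illustration, a possible invocation (the mode and path are
+   hypothetical):
+
+     * <<<hdfs dfs -chmod -R 755 /user/hadoop/dir1>>>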
+
+chown
+
+   Usage: <<<hdfs dfs -chown [-R] [OWNER][:[GROUP]] URI [URI ]>>>
+
+   Change the owner of files. With -R, make the change recursively through the
+   directory structure. The user must be a super-user. Additional information
+   is in the {{{betterurl}Permissions Guide}}.
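+
+   For illustration, a possible invocation (the owner, group and path are
+   hypothetical):
+
+     * <<<hdfs dfs -chown -R hadoopuser:hadoopusers /user/hadoop/dir1>>>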
+
+copyFromLocal
+
+   Usage: <<<hdfs dfs -copyFromLocal <localsrc> URI>>>
+
+   Similar to put command, except that the source is restricted to a local
+   file reference.
+
+copyToLocal
+
+   Usage: <<<hdfs dfs -copyToLocal [-ignorecrc] [-crc] URI <localdst> >>>
+
+   Similar to get command, except that the destination is restricted to a
+   local file reference.
+
+count
+
+   Usage: <<<hdfs dfs -count [-q] <paths> >>>
+
+   Count the number of directories, files and bytes under the paths that match
+   the specified file pattern.  The output columns with -count are: DIR_COUNT,
+   FILE_COUNT, CONTENT_SIZE, FILE_NAME
+
+   The output columns with -count -q are: QUOTA, REMAINING_QUOTA, SPACE_QUOTA,
+   REMAINING_SPACE_QUOTA, DIR_COUNT, FILE_COUNT, CONTENT_SIZE, FILE_NAME
+
+   Example:
+
+     * <<<hdfs dfs -count hdfs://nn1.example.com/file1 hdfs://nn2.example.com/file2>>>
+
+     * <<<hdfs dfs -count -q hdfs://nn1.example.com/file1>>>
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+cp
+
+   Usage: <<<hdfs dfs -cp URI [URI ...] <dest> >>>
+
+   Copy files from source to destination. This command allows multiple sources
+   as well in which case the destination must be a directory.
+
+   Example:
+
+     * <<<hdfs dfs -cp /user/hadoop/file1 /user/hadoop/file2>>>
+
+     * <<<hdfs dfs -cp /user/hadoop/file1 /user/hadoop/file2 /user/hadoop/dir>>>
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+du
+
+   Usage: <<<hdfs dfs -du [-s] [-h] URI [URI ...]>>>
+
+   Displays sizes of files and directories contained in the given directory or
+   the length of a file in case it's just a file.
+
+   Options:
+
+     * The -s option will result in an aggregate summary of file lengths being
+       displayed, rather than the individual files.
+
+     * The -h option will format file sizes in a "human-readable" fashion (e.g
+       64.0m instead of 67108864)
+
+   Example:
+
+     * <<<hdfs dfs -du /user/hadoop/dir1 /user/hadoop/file1 hdfs://nn.example.com/user/hadoop/dir1>>>
+
+   Exit Code:
+   Returns 0 on success and -1 on error.
+
+dus
+
+   Usage: <<<hdfs dfs -dus <args> >>>
+
+   Displays a summary of file lengths. This is an alternate form of hdfs dfs -du -s.
+
+expunge
+
+   Usage: <<<hdfs dfs -expunge>>>
+
+   Empty the Trash. Refer to the {{{betterurl}HDFS Architecture Guide}} for
+   more information on the Trash feature.
+
+get
+
+   Usage: <<<hdfs dfs -get [-ignorecrc] [-crc] <src> <localdst> >>>
+
+   Copy files to the local file system. Files that fail the CRC check may be
+   copied with the -ignorecrc option. Files and CRCs may be copied using the
+   -crc option.
+
+   Example:
+
+     * <<<hdfs dfs -get /user/hadoop/file localfile>>>
+
+     * <<<hdfs dfs -get hdfs://nn.example.com/user/hadoop/file localfile>>>
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+getmerge
+
+   Usage: <<<hdfs dfs -getmerge <src> <localdst> [addnl]>>>
+
+   Takes a source directory and a destination file as input and concatenates
+   files in src into the destination local file. Optionally addnl can be set to
+   enable adding a newline character at the
+   end of each file.
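+
+   For illustration, a possible invocation (the source directory and local
+   destination file are hypothetical):
+
+     * <<<hdfs dfs -getmerge /user/hadoop/output ./merged.txt>>>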
+
+ls
+
+   Usage: <<<hdfs dfs -ls <args> >>>
+
+   For a file returns stat on the file with the following format:
+
++---+
+permissions number_of_replicas userid groupid filesize modification_date modification_time filename
++---+
+
+   For a directory it returns the list of its direct children, as in Unix. A directory is listed as:
+
++---+
+permissions userid groupid modification_date modification_time dirname
++---+
+
+   Example:
+
+     * <<<hdfs dfs -ls /user/hadoop/file1>>>
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+lsr
+
+   Usage: <<<hdfs dfs -lsr <args> >>>
+
+   Recursive version of ls. Similar to Unix ls -R.
+
+mkdir
+
+   Usage: <<<hdfs dfs -mkdir [-p] <paths> >>>
+
+   Takes path URIs as arguments and creates directories.  With -p the behavior
+   is much like unix mkdir -p creating parent directories along the path.
+
+   Example:
+
+     * <<<hdfs dfs -mkdir /user/hadoop/dir1 /user/hadoop/dir2>>>
+
+     * <<<hdfs dfs -mkdir hdfs://nn1.example.com/user/hadoop/dir hdfs://nn2.example.com/user/hadoop/dir>>>
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+moveFromLocal
+
+   Usage: <<<hdfs dfs -moveFromLocal <localsrc> <dst> >>>
+
+   Similar to put command, except that the source localsrc is deleted after
+   it's copied.
+
+moveToLocal
+
+   Usage: <<<hdfs dfs -moveToLocal [-crc] <src> <dst> >>>
+
+   Displays a "Not implemented yet" message.
+
+mv
+
+   Usage: <<<hdfs dfs -mv URI [URI ...] <dest> >>>
+
+   Moves files from source to destination. This command allows multiple sources
+   as well in which case the destination needs to be a directory. Moving files
+   across file systems is not permitted.
+
+   Example:
+
+     * <<<hdfs dfs -mv /user/hadoop/file1 /user/hadoop/file2>>>
+
+     * <<<hdfs dfs -mv hdfs://nn.example.com/file1 hdfs://nn.example.com/file2 hdfs://nn.example.com/file3 hdfs://nn.example.com/dir1>>>
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+put
+
+   Usage: <<<hdfs dfs -put <localsrc> ... <dst> >>>
+
+   Copy single src, or multiple srcs from local file system to the destination
+   file system. Also reads input from stdin and writes to destination file
+   system.
+
+     * <<<hdfs dfs -put localfile /user/hadoop/hadoopfile>>>
+
+     * <<<hdfs dfs -put localfile1 localfile2 /user/hadoop/hadoopdir>>>
+
+     * <<<hdfs dfs -put localfile hdfs://nn.example.com/hadoop/hadoopfile>>>
+
+     * <<<hdfs dfs -put - hdfs://nn.example.com/hadoop/hadoopfile>>>
+       Reads the input from stdin.
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+rm
+
+   Usage: <<<hdfs dfs -rm [-skipTrash] URI [URI ...]>>>
+
+   Delete files specified as args. Only deletes files and empty directories.
+   If the -skipTrash option is specified, the trash, if enabled, will be
+   bypassed and the specified file(s) deleted immediately. This can be useful
+   when it is necessary to delete files from an over-quota directory. Refer to
+   rmr for recursive deletes.
+
+   Example:
+
+     * <<<hdfs dfs -rm hdfs://nn.example.com/file /user/hadoop/emptydir>>>
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+rmr
+
+   Usage: <<<hdfs dfs -rmr [-skipTrash] URI [URI ...]>>>
+
+   Recursive version of delete. If the -skipTrash option is specified, the
+   trash, if enabled, will be bypassed and the specified file(s) deleted
+   immediately. This can be useful when it is necessary to delete files from an
+   over-quota directory.
+
+   Example:
+
+     * <<<hdfs dfs -rmr /user/hadoop/dir>>>
+
+     * <<<hdfs dfs -rmr hdfs://nn.example.com/user/hadoop/dir>>>
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+setrep
+
+   Usage: <<<hdfs dfs -setrep [-R] <path> >>>
+
+   Changes the replication factor of a file. -R option is for recursively
+   increasing the replication factor of files within a directory.
+
+   Example:
+
+     * <<<hdfs dfs -setrep -w 3 -R /user/hadoop/dir1>>>
+
+   Exit Code:
+
+   Returns 0 on success and -1 on error.
+
+stat
+
+   Usage: <<<hdfs dfs -stat URI [URI ...]>>>
+
+   Returns the stat information on the path.
+
+   Example:
+
+     * <<<hdfs dfs -stat path>>>
+
+   Exit Code:
+   Returns 0 on success and -1 on error.
+
+tail
+
+   Usage: <<<hdfs dfs -tail [-f] URI>>>
+
+   Displays last kilobyte of the file to stdout. -f option can be used as in
+   Unix.
+
+   Example:
+
+     * <<<hdfs dfs -tail pathname>>>
+
+   Exit Code:
+   Returns 0 on success and -1 on error.
+
+test
+
+   Usage: <<<hdfs dfs -test -[ezd] URI>>>
+
+   Options:
+
+*----+------------+
+| -e | check to see if the file exists. Return 0 if true.
+*----+------------+
+| -z | check to see if the file is zero length. Return 0 if true.
+*----+------------+
+| -d | check to see if the path is directory. Return 0 if true.
+*----+------------+
+
+   Example:
+
+     * <<<hdfs dfs -test -e filename>>>
+
+text
+
+   Usage: <<<hdfs dfs -text <src> >>>
+
+   Takes a source file and outputs the file in text format. The allowed formats
+   are zip and TextRecordInputStream.
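+
+   For illustration, a possible invocation (the path is hypothetical):
+
+     * <<<hdfs dfs -text /user/hadoop/data.seq>>>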
+
+touchz
+
+   Usage: <<<hdfs dfs -touchz URI [URI ...]>>>
+
+   Create a file of zero length.
+
+   Example:
+
+     * <<<hdfs dfs -touchz pathname>>>
+
+   Exit Code:
+   Returns 0 on success and -1 on error.

+ 99 - 0
hadoop-common-project/hadoop-common/src/site/apt/HttpAuthentication.apt.vm

@@ -0,0 +1,99 @@
+~~ Licensed under the Apache License, Version 2.0 (the "License");
+~~ you may not use this file except in compliance with the License.
+~~ You may obtain a copy of the License at
+~~
+~~   http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License. See accompanying LICENSE file.
+
+  ---
+  Authentication for Hadoop HTTP web-consoles
+  ---
+  ---
+  ${maven.build.timestamp}
+
+Authentication for Hadoop HTTP web-consoles
+
+%{toc|section=1|fromDepth=0}
+
+* Introduction
+
+   This document describes how to configure Hadoop HTTP web-consoles to
+   require user authentication.
+
+   By default Hadoop HTTP web-consoles (JobTracker, NameNode, TaskTrackers
+   and DataNodes) allow access without any form of authentication.
+
+   Similarly to Hadoop RPC, Hadoop HTTP web-consoles can be configured to
+   require Kerberos authentication using the HTTP SPNEGO protocol (supported
+   by browsers like Firefox and Internet Explorer).
+
+   In addition, Hadoop HTTP web-consoles support the equivalent of
+   Hadoop's Pseudo/Simple authentication. If this option is enabled, users
+   must specify their user name in the first browser interaction using the
+   user.name query string parameter. For example:
+   <<<http://localhost:50030/jobtracker.jsp?user.name=babu>>>.
+
+   If a custom authentication mechanism is required for the HTTP
+   web-consoles, it is possible to implement a plugin to support the
+   alternate authentication mechanism (refer to the Hadoop hadoop-auth module
+   for details on writing an <<<AuthenticationHandler>>>).
+
+   The next section describes how to configure Hadoop HTTP web-consoles to
+   require user authentication.
+
+* Configuration
+
+   The following properties should be in the <<<core-site.xml>>> of all the
+   nodes in the cluster.
+
+   <<<hadoop.http.filter.initializers>>>: add to this property the
+   <<<org.apache.hadoop.security.AuthenticationFilterInitializer>>> initializer
+   class.
+
+   <<<hadoop.http.authentication.type>>>: Defines authentication used for the
+   HTTP web-consoles. The supported values are: <<<simple>>> | <<<kerberos>>> |
+   <<<#AUTHENTICATION_HANDLER_CLASSNAME#>>>. The default value is <<<simple>>>.
+
+   <<<hadoop.http.authentication.token.validity>>>: Indicates how long (in
+   seconds) an authentication token is valid before it has to be renewed.
+   The default value is <<<36000>>>.
+
+   <<<hadoop.http.authentication.signature.secret.file>>>: The signature secret
+   file for signing the authentication tokens. If not set a random secret is
+   generated at startup time. The same secret should be used for all nodes
+   in the cluster, JobTracker, NameNode, DataNode and TaskTracker. The
+   default value is <<<${user.home}/hadoop-http-auth-signature-secret>>>.
+   IMPORTANT: This file should be readable only by the Unix user running the
+   daemons.
+
+   <<<hadoop.http.authentication.cookie.domain>>>: The domain to use for the
+   HTTP cookie that stores the authentication token. In order for
+   authentication to work correctly across all nodes in the cluster the
+   domain must be correctly set. There is no default value; if the domain is
+   not set, the HTTP cookie will only work with the hostname that issued it.
+
+   IMPORTANT: when using IP addresses, browsers ignore cookies with domain
+   settings. For this setting to work properly all nodes in the cluster
+   must be configured to generate URLs with <<<hostname.domain>>> names.
+
+   <<<hadoop.http.authentication.simple.anonymous.allowed>>>: Indicates if
+   anonymous requests are allowed when using 'simple' authentication. The
+   default value is <<<true>>>.
+
+   <<<hadoop.http.authentication.kerberos.principal>>>: Indicates the Kerberos
+   principal to be used for HTTP endpoint when using 'kerberos'
+   authentication. The principal short name must be <<<HTTP>>> per Kerberos HTTP
+   SPNEGO specification. The default value is <<<HTTP/_HOST@$LOCALHOST>>>,
+   where <<<_HOST>>>, if present, is replaced with the bind address of the
+   HTTP server.
+
+   <<<hadoop.http.authentication.kerberos.keytab>>>: Location of the keytab file
+   with the credentials for the Kerberos principal used for the HTTP
+   endpoint. The default value is <<<${user.home}/hadoop.keytab>>>.
+
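+   For illustration only, a minimal <<<core-site.xml>>> sketch that wires up
+   Kerberos-based authentication with the properties described above might
+   look as follows. The realm, cookie domain, keytab and secret file
+   locations are placeholder values, not defaults:
+
++---+
+<configuration>
+  <!-- Illustrative values only; adjust principal, domain, keytab and paths
+       for your cluster. -->
+  <property>
+    <name>hadoop.http.filter.initializers</name>
+    <value>org.apache.hadoop.security.AuthenticationFilterInitializer</value>
+  </property>
+  <property>
+    <name>hadoop.http.authentication.type</name>
+    <value>kerberos</value>
+  </property>
+  <property>
+    <name>hadoop.http.authentication.token.validity</name>
+    <value>36000</value>
+  </property>
+  <property>
+    <name>hadoop.http.authentication.signature.secret.file</name>
+    <value>/etc/hadoop/conf/http-auth-signature-secret</value>
+  </property>
+  <property>
+    <name>hadoop.http.authentication.cookie.domain</name>
+    <value>example.com</value>
+  </property>
+  <property>
+    <name>hadoop.http.authentication.kerberos.principal</name>
+    <value>HTTP/_HOST@EXAMPLE.COM</value>
+  </property>
+  <property>
+    <name>hadoop.http.authentication.kerberos.keytab</name>
+    <value>/etc/hadoop/conf/http.keytab</value>
+  </property>
+</configuration>
++---+
+
+   For <<<simple>>> authentication, the Kerberos-related properties can be
+   omitted and <<<hadoop.http.authentication.type>>> left at its default
+   value of <<<simple>>>.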

+ 0 - 536
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/content/xdocs/hdfs_design.xml

@@ -1,536 +0,0 @@
-<?xml version="1.0"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
-          "http://forrest.apache.org/dtd/document-v20.dtd">
-
-
-<document>
-
-  <header>
-    <title> 
-      HDFS Architecture Guide
-    </title>
-    <authors>
-      <person name="Dhruba Borthakur" email="dhruba@yahoo-inc.com"/>
-    </authors> 
-  </header>
-
-  <body>
-    <section>
-      <title> Introduction </title>
-      <p>
-      The Hadoop Distributed File System (<acronym title="Hadoop Distributed File System">HDFS</acronym>) is a distributed file system 
-      designed to run on commodity hardware. It has many similarities with existing distributed file systems. However, the differences from 
-      other distributed file systems are significant. HDFS is highly fault-tolerant and is designed to be deployed on low-cost hardware. 
-      HDFS provides high throughput access to application data and is suitable for applications that have large data sets. HDFS relaxes 
-      a few POSIX requirements to enable streaming access to file system data.  HDFS was originally built as infrastructure for the 
-      Apache Nutch web search engine project. HDFS is now an Apache Hadoop subproject.
-      The project URL is <a href="http://hadoop.apache.org/hdfs/">http://hadoop.apache.org/hdfs/</a>.
-      </p>
-    </section>
-
-    <section> 
-      <title> Assumptions and Goals </title>
-
-      <section> 
-        <title> Hardware Failure </title>
-        <p>
-        Hardware failure is the norm rather than the exception. An HDFS instance may consist of hundreds or thousands of server machines, 
-        each storing part of the file system&#x2019;s data. The fact that there are a huge number of components and that each component has 
-        a non-trivial probability of failure means that some component of HDFS is always non-functional. Therefore, detection of faults and quick, 
-        automatic recovery from them is a core architectural goal of HDFS.
-       </p>
-     </section>
-
- 
-      <section> 
-        <title> Streaming Data Access </title>
-        <p>
-        Applications that run on HDFS need streaming access to their data sets. They are not general purpose applications that typically run 
-        on general purpose file systems. HDFS is designed more for batch processing rather than interactive use by users. The emphasis is on 
-        high throughput of data access rather than low latency of data access. POSIX imposes many hard requirements that are not needed for 
-        applications that are targeted for HDFS. POSIX semantics in a few key areas has been traded to increase data throughput rates. 
-        </p>
-      </section>
-
-      <section> 
-        <title> Large Data Sets </title>
-        <p>
-        Applications that run on HDFS have large data sets. A typical file in HDFS is gigabytes to terabytes in size. Thus, HDFS is tuned to 
-        support large files. It should provide high aggregate data bandwidth and scale to thousands of nodes in a single cluster. It should support 
-        tens of millions of files in a single instance.
-        </p>
-      </section>
-
- 
-      <section> 
-        <title> Appending-Writes and File Syncs </title>
-        <p>
-        Most HDFS applications need a write-once-read-many access model for files. HDFS provides two additional advanced features: hflush and
-        append.  Hflush makes the last block of an unclosed file visible to readers while providing read consistency and data durability.  Append
-        provides a mechanism for opening a closed file to add additional data.
-        </p>
-        <p>
-        For complete details of the hflush and append design, see the 
-        <a href="https://issues.apache.org/jira/secure/attachment/12445209/appendDesign3.pdf">Append/Hflush/Read Design document</a> (PDF).
-        </p>
-      </section>
-
- 
-      <section> 
-        <title> &#x201c;Moving Computation is Cheaper than Moving Data&#x201d; </title>
-        <p>
-        A computation requested by an application is much more efficient if it is executed near the data it operates on. This is especially true 
-        when the size of the data set is huge. This minimizes network congestion and increases the overall throughput of the system. The 
-        assumption is that it is often better to migrate the computation closer to where the data is located rather than moving the data to where 
-        the application is running. HDFS provides interfaces for applications to move themselves closer to where the data is located. 
-        </p>
-      </section>
-
-
-      <section> 
-        <title> Portability Across Heterogeneous Hardware and Software Platforms </title>
-        <p>
-        HDFS has been designed to be easily portable from one platform to another. This facilitates widespread adoption of HDFS as a 
-        platform of choice for a large set of applications. 
-        </p>
-      </section>
-    </section>
-
- 
-    <section>
-      <title> NameNode and DataNodes </title>
-      <p>
-      HDFS has a master/slave architecture. An HDFS cluster consists of a single NameNode, a master server that manages the file 
-      system namespace and regulates access to files by clients. In addition, there are a number of DataNodes, usually one per node 
-      in the cluster, which manage storage attached to the nodes that they run on. HDFS exposes a file system namespace and allows 
-      user data to be stored in files. Internally, a file is split into one or more blocks and these blocks are stored in a set of DataNodes. 
-      The NameNode executes file system namespace operations like opening, closing, and renaming files and directories. It also 
-      determines the mapping of blocks to DataNodes. The DataNodes are responsible for serving read and write requests from the file 
-      system&#x2019;s clients. The DataNodes also perform block creation, deletion, and replication upon instruction from the NameNode.
-      </p>
-      
-           <figure alt="HDFS Architecture" src="images/hdfsarchitecture.gif"/>
-
-      <p>
-      The NameNode and DataNode are pieces of software designed to run on commodity machines. These machines typically run a 
-      GNU/Linux operating system (<acronym title="operating system">OS</acronym>). HDFS is built using the Java language; any 
-      machine that supports Java can run the NameNode or the DataNode software. Usage of the highly portable Java language means 
-      that HDFS can be deployed on a wide range of machines. A typical deployment has a dedicated machine that runs only the 
-      NameNode software. Each of the other machines in the cluster runs one instance of the DataNode software. The architecture 
-      does not preclude running multiple DataNodes on the same machine but in a real deployment that is rarely the case.
-      </p>
-      <p>
-      The existence of a single NameNode in a cluster greatly simplifies the architecture of the system. The NameNode is the arbitrator 
-      and repository for all HDFS metadata. The system is designed in such a way that user data never flows through the NameNode.
-      </p>
-    </section>
-
- 
-
-    <section>
-      <title> The File System Namespace </title>
-      <p>
-      HDFS supports a traditional hierarchical file organization. A user or an application can create directories and store files inside 
-      these directories. The file system namespace hierarchy is similar to most other existing file systems; one can create and 
-      remove files, move a file from one directory to another, or rename a file. HDFS implements user quotas for number of names and 
-      amount of data stored in a particular directory (See 
-      <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_quota_admin_guide.html">HDFS Quota Admin Guide</a>). In addition, HDFS
-      supports <a href="http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/fs/FileContext.html#createSymlink(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path, boolean)">symbolic links</a>.
-      </p>
-      <p>
-      The NameNode maintains the file system namespace. Any change to the file system namespace or its properties is 
-      recorded by the NameNode. An application can specify the number of replicas of a file that should be maintained by 
-      HDFS. The number of copies of a file is called the replication factor of that file. This information is stored by the NameNode.
-      </p>
-    </section>
-
- 
-
-    <section> 
-      <title> Data Replication </title>
-      <p>
-      HDFS is designed to reliably store very large files across machines in a large cluster. It stores each file as a sequence 
-      of blocks; all blocks in a file except the last block are the same size. The blocks of a file are replicated for fault tolerance. 
-      The block size and replication factor are configurable per file. An application can specify the number of replicas of a file. 
-      The replication factor can be specified at file creation time and can be changed later. Files in HDFS are strictly one writer at any 
-      time. 
-      </p>
-      <p>
-      The NameNode makes all decisions regarding replication of blocks. It periodically receives a Heartbeat and a Blockreport 
-      from each of the DataNodes in the cluster. Receipt of a Heartbeat implies that the DataNode is functioning properly. A 
-      Blockreport contains a list of all blocks on a DataNode. 
-    </p>
-    <figure alt="HDFS DataNodes" src="images/hdfsdatanodes.gif"/>
-
-      <section>
-        <title> Replica Placement: The First Baby Steps </title>
-        <p>
-        The placement of replicas is critical to HDFS reliability and performance. Optimizing replica placement distinguishes 
-        HDFS from most other distributed file systems. This is a feature that needs lots of tuning and experience. The purpose 
-        of a rack-aware replica placement policy is to improve data reliability, availability, and network bandwidth utilization. 
-        The current implementation for the replica placement policy is a first effort in this direction. The short-term goals of 
-        implementing this policy are to validate it on production systems, learn more about its behavior, and build a foundation 
-        to test and research more sophisticated policies. 
-        </p>
-        <p>
-        Large HDFS instances run on a cluster of computers that commonly spread across many racks. Communication 
-        between two nodes in different racks has to go through switches. In most cases, network bandwidth between machines 
-        in the same rack is greater than network bandwidth between machines in different racks.  
-        </p>
-        <p>
-        The NameNode determines the rack id each DataNode belongs to via the process outlined in 
-        <a href="http://hadoop.apache.org/common/docs/current/cluster_setup.html#Hadoop+Rack+Awareness">Hadoop Rack Awareness</a>. 
-        A simple but non-optimal policy is to place replicas on unique racks. This prevents losing data when an entire rack 
-        fails and allows use of bandwidth from multiple racks when reading data. This policy evenly distributes replicas in 
-        the cluster which makes it easy to balance load on component failure. However, this policy increases the cost of 
-        writes because a write needs to transfer blocks to multiple racks. 
-        </p>
-        <p>
-        For the common case, when the replication factor is three, HDFS&#x2019;s placement policy is to put one replica 
-        on one node in the local rack, another on a node in a different (remote) rack, and the last on a different node in the 
-        same remote rack. This policy cuts the inter-rack write traffic which generally improves write performance. The 
-        chance of rack failure is far less than that of node failure; this policy does not impact data reliability and availability 
-        guarantees. However, it does reduce the aggregate network bandwidth used when reading data since a block is 
-        placed in only two unique racks rather than three. With this policy, the replicas of a file do not evenly distribute 
-        across the racks. One third of replicas are on one node, two thirds of replicas are on one rack, and the other third 
-        are evenly distributed across the remaining racks. This policy improves write performance without compromising 
-        data reliability or read performance.
-        </p>
-        <p>
-        In addition to the default placement policy described above, HDFS also provides a pluggable interface for block placement. See
-        <a href="http://hadoop.apache.org/hdfs/docs/current/api/org/apache/hadoop/hdfs/server/namenode/BlockPlacementPolicy.html">BlockPlacementPolicy</a>.
-        </p>
-      </section>
-
-      <section> 
-        <title> Replica Selection </title>
-        <p>
-        To minimize global bandwidth consumption and read latency, HDFS tries to satisfy a read request from a replica 
-        that is closest to the reader. If there exists a replica on the same rack as the reader node, then that replica is 
-        preferred to satisfy the read request. If an HDFS cluster spans multiple data centers, then a replica that is 
-        resident in the local data center is preferred over any remote replica.
-        </p>
-      </section>
-
-      <section> 
-        <title> Safemode </title>
-        <p>
-        On startup, the NameNode enters a special state called Safemode. Replication of data blocks does not occur 
-        when the NameNode is in the Safemode state. The NameNode receives Heartbeat and Blockreport messages 
-        from the DataNodes. A Blockreport contains the list of data blocks that a DataNode is hosting. Each block 
-        has a specified minimum number of replicas. A block is considered safely replicated when the minimum number 
-        of replicas of that data block has checked in with the NameNode. After a configurable percentage of safely 
-        replicated data blocks checks in with the NameNode (plus an additional 30 seconds), the NameNode exits 
-        the Safemode state. It then determines the list of data blocks (if any) that still have fewer than the specified 
-        number of replicas. The NameNode then replicates these blocks to other DataNodes.
-        </p>
-      </section>
-
-    </section>
-
-    <section>
-      <title> The Persistence of File System Metadata </title>
-        <p>
-        The HDFS namespace is stored by the NameNode. The NameNode uses a transaction log called the EditLog 
-        to persistently record every change that occurs to file system metadata. For example, creating a new file in 
-        HDFS causes the NameNode to insert a record into the EditLog indicating this. Similarly, changing the 
-        replication factor of a file causes a new record to be inserted into the EditLog. The NameNode uses a file 
-        in its local host OS file system to store the EditLog. The entire file system namespace, including the mapping 
-        of blocks to files and file system properties, is stored in a file called the FsImage. The FsImage is stored as 
-        a file in the NameNode&#x2019;s local file system too.
-        </p>
-        <p>
-        The NameNode keeps an image of the entire file system namespace and file Blockmap in memory. This key 
-        metadata item is designed to be compact, such that a NameNode with 4 GB of RAM is plenty to support a 
-        huge number of files and directories. When the NameNode starts up, it reads the FsImage and EditLog from 
-        disk, applies all the transactions from the EditLog to the in-memory representation of the FsImage, and flushes 
-        out this new version into a new FsImage on disk. It can then truncate the old EditLog because its transactions 
-        have been applied to the persistent FsImage. This process is called a checkpoint. The 
-        <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Checkpoint+Node">Checkpoint Node</a> is a 
-        separate daemon that can be configured to periodically build checkpoints from the FsImage and EditLog which are 
-        uploaded to the NameNode.  The 
-        <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Backup+Node">Backup Node</a> builds 
-        checkpoints like the Checkpoint Node and also maintains an up-to-date copy of the FsImage in memory.
-        </p>
-        <p>
-        The DataNode stores HDFS data in files in its local file system. The DataNode has no knowledge about HDFS files. 
-        It stores each block of HDFS data in a separate file in its local file system. The DataNode does not create all files 
-        in the same directory. Instead, it uses a heuristic to determine the optimal number of files per directory and creates 
-        subdirectories appropriately. It is not optimal to create all local files in the same directory because the local file 
-        system might not be able to efficiently support a huge number of files in a single directory. When a DataNode starts 
-        up, it scans through its local file system, generates a list of all HDFS data blocks that correspond to each of these 
-        local files and sends this report to the NameNode: this is the Blockreport. 
-        </p>
-    </section>
-
-
-    <section> 
-      <title> The Communication Protocols </title>
-      <p>
-      All HDFS communication protocols are layered on top of the TCP/IP protocol. A client establishes a connection to 
-      a configurable <acronym title="Transmission Control Protocol">TCP</acronym> port on the NameNode machine. 
-      It talks the ClientProtocol with the NameNode. The DataNodes talk to the NameNode using the DataNode Protocol. 
-      A Remote Procedure Call (<acronym title="Remote Procedure Call">RPC</acronym>) abstraction wraps both the 
-      Client Protocol and the DataNode Protocol. By design, the NameNode never initiates any RPCs. Instead, it only 
-      responds to RPC requests issued by DataNodes or clients. 
-      </p>
-    </section>
- 
-
-    <section> 
-      <title> Robustness </title>
-      <p>
-      The primary objective of HDFS is to store data reliably even in the presence of failures. The three common types 
-      of failures are NameNode failures, DataNode failures and network partitions.
-      </p>
- 
-      <section>
-        <title> Data Disk Failure, Heartbeats and Re-Replication </title>
-        <p>
-        Each DataNode sends a Heartbeat message to the NameNode periodically. A network partition can cause a 
-        subset of DataNodes to lose connectivity with the NameNode. The NameNode detects this condition by the 
-        absence of a Heartbeat message. The NameNode marks DataNodes without recent Heartbeats as dead and 
-        does not forward any new <acronym title="Input/Output">IO</acronym> requests to them. Any data that was 
-        registered to a dead DataNode is not available to HDFS any more. DataNode death may cause the replication 
-        factor of some blocks to fall below their specified value. The NameNode constantly tracks which blocks need 
-        to be replicated and initiates replication whenever necessary. The necessity for re-replication may arise due 
-        to many reasons: a DataNode may become unavailable, a replica may become corrupted, a hard disk on a 
-        DataNode may fail, or the replication factor of a file may be increased. 
-        </p>
-      </section>
-
-      <section>
-        <title> Cluster Rebalancing </title>
-        <p>
-        The HDFS architecture is compatible with data rebalancing schemes. A scheme might automatically move 
-        data from one DataNode to another if the free space on a DataNode falls below a certain threshold. In the 
-        event of a sudden high demand for a particular file, a scheme might dynamically create additional replicas 
-        and rebalance other data in the cluster. These types of data rebalancing schemes are not yet implemented. 
-        </p>
-      </section>
-
-      <section>
-        <title> Data Integrity </title>
-        <p>
-        <!-- XXX "checksum checking" sounds funny -->
-        It is possible that a block of data fetched from a DataNode arrives corrupted. This corruption can occur 
-        because of faults in a storage device, network faults, or buggy software. The HDFS client software 
-        implements checksum checking on the contents of HDFS files. When a client creates an HDFS file, 
-        it computes a checksum of each block of the file and stores these checksums in a separate hidden 
-        file in the same HDFS namespace. When a client retrieves file contents it verifies that the data it 
-        received from each DataNode matches the checksum stored in the associated checksum file. If not, 
-        then the client can opt to retrieve that block from another DataNode that has a replica of that block.
-        </p>
-      </section>
- 
-
-      <section>
-        <title> Metadata Disk Failure </title>
-        <p>
-        The FsImage and the EditLog are central data structures of HDFS. A corruption of these files can 
-        cause the HDFS instance to be non-functional. For this reason, the NameNode can be configured 
-        to support maintaining multiple copies of the FsImage and EditLog. Any update to either the FsImage 
-        or EditLog causes each of the FsImages and EditLogs to get updated synchronously. This 
-        synchronous updating of multiple copies of the FsImage and EditLog may degrade the rate of 
-        namespace transactions per second that a NameNode can support. However, this degradation is 
-        acceptable because even though HDFS applications are very data intensive in nature, they are not 
-        metadata intensive. When a NameNode restarts, it selects the latest consistent FsImage and EditLog to use.
-        </p>
-        <p> 
-        The NameNode machine is a single point of failure for an HDFS cluster. If the NameNode machine fails, 
-        manual intervention is necessary. Currently, automatic restart and failover of the NameNode software to 
-        another machine is not supported.
-        </p>
-      </section>
-
-      <section>
-        <title> Snapshots </title>
-        <p>
-        Snapshots support storing a copy of data at a particular instant of time. One usage of the snapshot 
-        feature may be to roll back a corrupted HDFS instance to a previously known good point in time. 
-        HDFS does not currently support snapshots but will in a future release.
-        </p>
-      </section>
-
-    </section>
- 
-
-    <section> 
-      <!-- XXX Better name -->
-      <title> Data Organization </title>
-
-      <section>
-        <title> Data Blocks </title>
-        <p>
-        HDFS is designed to support very large files. Applications that are compatible with HDFS are those 
-        that deal with large data sets. These applications write their data only once but they read it one or 
-        more times and require these reads to be satisfied at streaming speeds. HDFS supports 
-        write-once-read-many semantics on files. A typical block size used by HDFS is 64 MB. Thus, 
-        an HDFS file is chopped up into 64 MB chunks, and if possible, each chunk will reside on a different DataNode.
-        </p>
-      </section>
-
-      <section>
-        <title> Replication Pipelining </title>
-        <p>
-        When a client is writing data to an HDFS file with a replication factor of 3, the NameNode retrieves a list of DataNodes using a replication target choosing algorithm.
-        This list contains the DataNodes that will host a replica of that block. The client then writes to the first DataNode. The first DataNode starts receiving the data in small portions (64 KB, configurable), 
-        writes each portion to its local repository and transfers that portion to the second DataNode in the list. 
-        The second DataNode, in turn starts receiving each portion of the data block, writes that portion to its 
-        repository and then flushes that portion to the third DataNode. Finally, the third DataNode writes the 
-        data to its local repository. Thus, a DataNode can be receiving data from the previous one in the pipeline 
-        and at the same time forwarding data to the next one in the pipeline. Thus, the data is pipelined from 
-        one DataNode to the next.
-        </p>
-      </section>
-
-    </section>
-
-    <section>
-      <!-- XXX "Accessibility" sounds funny - "Interfaces" ? -->
-      <title> Accessibility </title>
-      <!-- XXX Make an API section ? (HTTP is "web service" API?) -->
-      <p>
-      HDFS can be accessed from applications in many different ways. Natively, HDFS provides a 
-      <a href="http://hadoop.apache.org/core/docs/current/api/">Java API</a> for applications to 
-      use. A C language wrapper for this Java API is also available. In addition, an HTTP browser 
-      can also be used to browse the files of an HDFS instance. Work is in progress to expose 
-      HDFS through the <acronym title="Web-based Distributed Authoring and Versioning">WebDAV</acronym> protocol. 
-      </p>
-
-      <section>
-        <title> FS Shell </title>
-        <p>
-        HDFS allows user data to be organized in the form of files and directories. It provides a commandline 
-        interface called  FS shell that lets a user interact with the data in HDFS. The syntax of this command 
-        set is similar to other shells (e.g. bash, csh) that users are already familiar with. Here are some sample 
-        action/command pairs:
-        </p>
-        <table>
-          <tr>
-            <th> Action </th><th> Command </th>
-          </tr>
-          <tr>
-            <td> Create a directory named <code>/foodir</code> </td> 
-            <td> <code>bin/hadoop dfs -mkdir /foodir</code> </td>
-          </tr>
-          <tr>
-            <td> Remove a directory named <code>/foodir</code> </td> 
-            <td> <code>bin/hadoop dfs -rmr /foodir</code> </td>
-          </tr>
-          <tr>
-            <td> View the contents of a file named <code>/foodir/myfile.txt</code> </td> 
-            <td> <code>bin/hadoop dfs -cat /foodir/myfile.txt</code> </td>
-          </tr>
-        </table>
-        <p>
-        FS shell is targeted for applications that need a scripting language to interact with the stored data.
-        </p>
-      </section>
-
-      <section> 
-        <title> DFSAdmin </title>
-        <p>
-        The DFSAdmin command set is used for administering an HDFS cluster. These are commands that are 
-        used only by an HDFS administrator. Here are some sample action/command pairs:
-        </p>
-        <table>
-          <tr>
-            <th> Action </th><th> Command </th>
-          </tr>
-          <tr>
-            <td> Put the cluster in Safemode </td> <td> <code>bin/hadoop dfsadmin -safemode enter</code> </td>
-          </tr>
-          <tr>
-            <td> Generate a list of DataNodes </td> <td> <code>bin/hadoop dfsadmin -report</code> </td>
-          </tr>
-          <tr>
-            <td> Recommission or decommission DataNode(s) </td>
-            <td> <code>bin/hadoop dfsadmin -refreshNodes</code> </td>
-          </tr>
-        </table>
-      </section>
-
-      <section> 
-        <title> Browser Interface </title>
-        <p>
-        A typical HDFS install configures a web server to expose the HDFS namespace through 
-        a configurable TCP port. This allows a user to navigate the HDFS namespace and view 
-        the contents of its files using a web browser.
-       </p>
-      </section>
-
-    </section> 
-
-    <section> 
-      <title> Space Reclamation </title>
-
-      <section>
-        <title> File Deletes and Undeletes </title>
-        <p>
-        When a file is deleted by a user or an application, it is not immediately removed from HDFS.  Instead, 
-        HDFS first renames it to a file in the <code>/trash</code> directory. The file can be restored quickly 
-        as long as it remains in <code>/trash</code>. A file remains in <code>/trash</code> for a configurable 
-        amount of time. After the expiry of its life in <code>/trash</code>, the NameNode deletes the file from 
-        the HDFS namespace. The deletion of a file causes the blocks associated with the file to be freed. 
-        Note that there could be an appreciable time delay between the time a file is deleted by a user and 
-        the time of the corresponding increase in free space in HDFS.
-        </p>
-        <p>
-        A user can Undelete a file after deleting it as long as it remains in the <code>/trash</code> directory. 
-        If a user wants to undelete a file that he/she has deleted, he/she can navigate the <code>/trash</code> 
-        directory and retrieve the file. The <code>/trash</code> directory contains only the latest copy of the file 
-        that was deleted. The <code>/trash</code> directory is just like any other directory with one special 
-        feature: HDFS applies specified policies to automatically delete files from this directory.
-        By default, the trash feature is disabled. It can be enabled by setting the <em>fs.trash.interval</em> property in core-site.xml to a non-zero value (set as minutes of retention required). The property needs to exist on both client and server side configurations.
-        </p>
-      </section>
-
-      <section>
-        <title> Decrease Replication Factor </title>
-        <p>
-        When the replication factor of a file is reduced, the NameNode selects excess replicas that can be deleted. 
-        The next Heartbeat transfers this information to the DataNode. The DataNode then removes the corresponding 
-        blocks and the corresponding free space appears in the cluster. Once again, there might be a time delay 
-        between the completion of the <code>setReplication</code> API call and the appearance of free space in the cluster.
-        </p>
-      </section>
-    </section>
-
-
-    <section>
-      <title> References </title>
-      <p>
-      HDFS Java API: 
-      <a href="http://hadoop.apache.org/core/docs/current/api/"> 
-        http://hadoop.apache.org/core/docs/current/api/
-      </a>
-      </p>
-      <p>
-      HDFS source code: 
-      <a href= "http://hadoop.apache.org/hdfs/version_control.html"> 
-        http://hadoop.apache.org/hdfs/version_control.html
-      </a>
-      </p>
-    </section> 
-
-  </body>
-</document>
-

+ 512 - 0
hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HdfsDesign.apt.vm

@@ -0,0 +1,512 @@
+~~ Licensed under the Apache License, Version 2.0 (the "License");
+~~ you may not use this file except in compliance with the License.
+~~ You may obtain a copy of the License at
+~~
+~~   http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License. See accompanying LICENSE file.
+
+  ---
+  HDFS Architecture
+  ---
+  Dhruba Borthakur
+  ---
+  ${maven.build.timestamp}
+
+%{toc|section=1|fromDepth=0}
+
+HDFS Architecture
+
+Introduction
+
+   The Hadoop Distributed File System (HDFS) is a distributed file system
+   designed to run on commodity hardware. It has many similarities with
+   existing distributed file systems. However, the differences from other
+   distributed file systems are significant. HDFS is highly fault-tolerant
+   and is designed to be deployed on low-cost hardware. HDFS provides high
+   throughput access to application data and is suitable for applications
+   that have large data sets. HDFS relaxes a few POSIX requirements to
+   enable streaming access to file system data. HDFS was originally built
+   as infrastructure for the Apache Nutch web search engine project. HDFS
+   is part of the Apache Hadoop Core project. The project URL is
+   {{http://hadoop.apache.org/}}.
+
+Assumptions and Goals
+
+Hardware Failure
+
+   Hardware failure is the norm rather than the exception. An HDFS
+   instance may consist of hundreds or thousands of server machines, each
+   storing part of the file system’s data. The fact that there are a huge
+   number of components and that each component has a non-trivial
+   probability of failure means that some component of HDFS is always
+   non-functional. Therefore, detection of faults and quick, automatic
+   recovery from them is a core architectural goal of HDFS.
+
+Streaming Data Access
+
+   Applications that run on HDFS need streaming access to their data sets.
+   They are not general purpose applications that typically run on general
+   purpose file systems. HDFS is designed more for batch processing rather
+   than interactive use by users. The emphasis is on high throughput of
+   data access rather than low latency of data access. POSIX imposes many
+   hard requirements that are not needed for applications that are
+   targeted for HDFS. POSIX semantics in a few key areas has been traded
+   to increase data throughput rates.
+
+Large Data Sets
+
+   Applications that run on HDFS have large data sets. A typical file in
+   HDFS is gigabytes to terabytes in size. Thus, HDFS is tuned to support
+   large files. It should provide high aggregate data bandwidth and scale
+   to thousands of nodes in a single cluster. It should support tens of
+   millions of files in a single instance.
+
+Simple Coherency Model
+
+   HDFS applications need a write-once-read-many access model for files. A
+   file once created, written, and closed need not be changed. This
+   assumption simplifies data coherency issues and enables high throughput
+   data access. A Map/Reduce application or a web crawler application fits
+   perfectly with this model. HDFS also supports appending data to existing
+   files as well as hflush, which makes the last block of an unclosed file
+   visible to readers.
+
+“Moving Computation is Cheaper than Moving Data”
+
+   A computation requested by an application is much more efficient if it
+   is executed near the data it operates on. This is especially true when
+   the size of the data set is huge. This minimizes network congestion and
+   increases the overall throughput of the system. The assumption is that
+   it is often better to migrate the computation closer to where the data
+   is located rather than moving the data to where the application is
+   running. HDFS provides interfaces for applications to move themselves
+   closer to where the data is located.
+
+Portability Across Heterogeneous Hardware and Software Platforms
+
+   HDFS has been designed to be easily portable from one platform to
+   another. This facilitates widespread adoption of HDFS as a platform of
+   choice for a large set of applications.
+
+NameNode and DataNodes
+
+   HDFS has a master/slave architecture. An HDFS cluster consists of a
+   single NameNode, a master server that manages the file system namespace
+   and regulates access to files by clients. In addition, there are a
+   number of DataNodes, usually one per node in the cluster, which manage
+   storage attached to the nodes that they run on. HDFS exposes a file
+   system namespace and allows user data to be stored in files.
+   Internally, a file is split into one or more blocks and these blocks
+   are stored in a set of DataNodes. The NameNode executes file system
+   namespace operations like opening, closing, and renaming files and
+   directories. It also determines the mapping of blocks to DataNodes. The
+   DataNodes are responsible for serving read and write requests from the
+   file system’s clients. The DataNodes also perform block creation,
+   deletion, and replication upon instruction from the NameNode.
+
+
+[images/hdfsarchitecture.png] HDFS Architecture
+
+   The NameNode and DataNode are pieces of software designed to run on
+   commodity machines. These machines typically run a GNU/Linux operating
+   system (OS). HDFS is built using the Java language; any machine that
+   supports Java can run the NameNode or the DataNode software. Usage of
+   the highly portable Java language means that HDFS can be deployed on a
+   wide range of machines. A typical deployment has a dedicated machine
+   that runs only the NameNode software. Each of the other machines in the
+   cluster runs one instance of the DataNode software. The architecture
+   does not preclude running multiple DataNodes on the same machine but in
+   a real deployment that is rarely the case.
+
+   The existence of a single NameNode in a cluster greatly simplifies the
+   architecture of the system. The NameNode is the arbitrator and
+   repository for all HDFS metadata. The system is designed in such a way
+   that user data never flows through the NameNode.
+
+The File System Namespace
+
+   HDFS supports a traditional hierarchical file organization. A user or
+   an application can create directories and store files inside these
+   directories. The file system namespace hierarchy is similar to most
+   other existing file systems; one can create and remove files, move a
+   file from one directory to another, or rename a file. HDFS implements
+   user quotas for the number of names and amount of data stored in a
+   particular directory, and supports symbolic links through the FileContext
+   API. HDFS does not support hard links, although the architecture does not
+   preclude implementing them.
+
+   The NameNode maintains the file system namespace. Any change to the
+   file system namespace or its properties is recorded by the NameNode. An
+   application can specify the number of replicas of a file that should be
+   maintained by HDFS. The number of copies of a file is called the
+   replication factor of that file. This information is stored by the
+   NameNode.
+
+Data Replication
+
+   HDFS is designed to reliably store very large files across machines in
+   a large cluster. It stores each file as a sequence of blocks; all
+   blocks in a file except the last block are the same size. The blocks of
+   a file are replicated for fault tolerance. The block size and
+   replication factor are configurable per file. An application can
+   specify the number of replicas of a file. The replication factor can be
+   specified at file creation time and can be changed later. Files in HDFS
+   are write-once and have strictly one writer at any time.
+
+   The NameNode makes all decisions regarding replication of blocks. It
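+   As an illustrative sketch, the cluster-wide defaults for these two
+   settings are commonly given in <<<hdfs-site.xml>>>; the snippet below
+   assumes the standard property names <<<dfs.replication>>> and
+   <<<dfs.blocksize>>>, and the values shown are examples, not requirements:
+
++---+
+<configuration>
+  <!-- Default replication factor for new files (may be overridden per file) -->
+  <property>
+    <name>dfs.replication</name>
+    <value>3</value>
+  </property>
+  <!-- Default block size in bytes for new files (64 MB in this example) -->
+  <property>
+    <name>dfs.blocksize</name>
+    <value>67108864</value>
+  </property>
+</configuration>
++---+
+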
+   periodically receives a Heartbeat and a Blockreport from each of the
+   DataNodes in the cluster. Receipt of a Heartbeat implies that the
+   DataNode is functioning properly. A Blockreport contains a list of all
+   blocks on a DataNode.
+
+[images/hdfsdatanodes.png] HDFS DataNodes
+
+Replica Placement: The First Baby Steps
+
+   The placement of replicas is critical to HDFS reliability and
+   performance. Optimizing replica placement distinguishes HDFS from most
+   other distributed file systems. This is a feature that needs lots of
+   tuning and experience. The purpose of a rack-aware replica placement
+   policy is to improve data reliability, availability, and network
+   bandwidth utilization. The current implementation for the replica
+   placement policy is a first effort in this direction. The short-term
+   goals of implementing this policy are to validate it on production
+   systems, learn more about its behavior, and build a foundation to test
+   and research more sophisticated policies.
+
+   Large HDFS instances run on a cluster of computers that commonly spread
+   across many racks. Communication between two nodes in different racks
+   has to go through switches. In most cases, network bandwidth between
+   machines in the same rack is greater than network bandwidth between
+   machines in different racks.
+
+   The NameNode determines the rack id each DataNode belongs to via the
+   process outlined in {{{../hadoop-common/ClusterSetup.html#Hadoop+Rack+Awareness}Hadoop Rack Awareness}}. A simple but non-optimal policy
+   is to place replicas on unique racks. This prevents losing data when an
+   entire rack fails and allows use of bandwidth from multiple racks when
+   reading data. This policy evenly distributes replicas in the cluster
+   which makes it easy to balance load on component failure. However, this
+   policy increases the cost of writes because a write needs to transfer
+   blocks to multiple racks.
+
+   For the common case, when the replication factor is three, HDFS’s
+   placement policy is to put one replica on one node in the local rack,
+   another on a node in a different (remote) rack, and the last on a
+   different node in the same remote rack. This policy cuts the inter-rack
+   write traffic which generally improves write performance. The chance of
+   rack failure is far less than that of node failure; this policy does
+   not impact data reliability and availability guarantees. However, it
+   does reduce the aggregate network bandwidth used when reading data
+   since a block is placed in only two unique racks rather than three.
+   With this policy, the replicas of a file do not evenly distribute
+   across the racks. One third of replicas are on one node, two thirds of
+   replicas are on one rack, and the other third are evenly distributed
+   across the remaining racks. This policy improves write performance
+   without compromising data reliability or read performance.
+
+   The current, default replica placement policy described here is a work
+   in progress.
+
+Replica Selection
+
+   To minimize global bandwidth consumption and read latency, HDFS tries
+   to satisfy a read request from a replica that is closest to the reader.
+   If there exists a replica on the same rack as the reader node, then
+   that replica is preferred to satisfy the read request. If an HDFS
+   cluster spans multiple data centers, then a replica that is resident in
+   the local data center is preferred over any remote replica.
+
+Safemode
+
+   On startup, the NameNode enters a special state called Safemode.
+   Replication of data blocks does not occur when the NameNode is in the
+   Safemode state. The NameNode receives Heartbeat and Blockreport
+   messages from the DataNodes. A Blockreport contains the list of data
+   blocks that a DataNode is hosting. Each block has a specified minimum
+   number of replicas. A block is considered safely replicated when the
+   minimum number of replicas of that data block has checked in with the
+   NameNode. After a configurable percentage of safely replicated data
+   blocks checks in with the NameNode (plus an additional 30 seconds), the
+   NameNode exits the Safemode state. It then determines the list of data
+   blocks (if any) that still have fewer than the specified number of
+   replicas. The NameNode then replicates these blocks to other DataNodes.
+
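+   The threshold percentage and the additional waiting period are
+   configurable. A hedged sketch, assuming the <<<hdfs-site.xml>>> property
+   names <<<dfs.namenode.safemode.threshold-pct>>> and
+   <<<dfs.namenode.safemode.extension>>>, with values matching the behavior
+   described above:
+
++---+
+<configuration>
+  <!-- Fraction of blocks that must satisfy the minimal replication
+       requirement before the NameNode leaves safemode -->
+  <property>
+    <name>dfs.namenode.safemode.threshold-pct</name>
+    <value>0.999f</value>
+  </property>
+  <!-- Additional time, in milliseconds, to remain in safemode after the
+       threshold has been reached (30 seconds here) -->
+  <property>
+    <name>dfs.namenode.safemode.extension</name>
+    <value>30000</value>
+  </property>
+</configuration>
++---+
+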
+The Persistence of File System Metadata
+
+   The HDFS namespace is stored by the NameNode. The NameNode uses a
+   transaction log called the EditLog to persistently record every change
+   that occurs to file system metadata. For example, creating a new file
+   in HDFS causes the NameNode to insert a record into the EditLog
+   indicating this. Similarly, changing the replication factor of a file
+   causes a new record to be inserted into the EditLog. The NameNode uses
+   a file in its local host OS file system to store the EditLog. The
+   entire file system namespace, including the mapping of blocks to files
+   and file system properties, is stored in a file called the FsImage. The
+   FsImage is stored as a file in the NameNode’s local file system too.
+
+   The NameNode keeps an image of the entire file system namespace and
+   file Blockmap in memory. This key metadata item is designed to be
+   compact, such that a NameNode with 4 GB of RAM is plenty to support a
+   huge number of files and directories. When the NameNode starts up, it
+   reads the FsImage and EditLog from disk, applies all the transactions
+   from the EditLog to the in-memory representation of the FsImage, and
+   flushes out this new version into a new FsImage on disk. It can then
+   truncate the old EditLog because its transactions have been applied to
+   the persistent FsImage. This process is called a checkpoint. The
+   Checkpoint Node is a separate daemon that can be configured to
+   periodically build checkpoints from the FsImage and EditLog, which are
+   then uploaded to the NameNode. The Backup Node builds checkpoints like
+   the Checkpoint Node and also maintains an up-to-date copy of the FsImage
+   in memory.
+
+   The DataNode stores HDFS data in files in its local file system. The
+   DataNode has no knowledge about HDFS files. It stores each block of
+   HDFS data in a separate file in its local file system. The DataNode
+   does not create all files in the same directory. Instead, it uses a
+   heuristic to determine the optimal number of files per directory and
+   creates subdirectories appropriately. It is not optimal to create all
+   local files in the same directory because the local file system might
+   not be able to efficiently support a huge number of files in a single
+   directory. When a DataNode starts up, it scans through its local file
+   system, generates a list of all HDFS data blocks that correspond to
+   each of these local files and sends this report to the NameNode: this
+   is the Blockreport.
+
+The Communication Protocols
+
+   All HDFS communication protocols are layered on top of the TCP/IP
+   protocol. A client establishes a connection to a configurable TCP port
+   on the NameNode machine. It talks the ClientProtocol with the NameNode.
+   The DataNodes talk to the NameNode using the DataNode Protocol. A
+   Remote Procedure Call (RPC) abstraction wraps both the Client Protocol
+   and the DataNode Protocol. By design, the NameNode never initiates any
+   RPCs. Instead, it only responds to RPC requests issued by DataNodes or
+   clients.
+
+Robustness
+
+   The primary objective of HDFS is to store data reliably even in the
+   presence of failures. The three common types of failures are NameNode
+   failures, DataNode failures and network partitions.
+
+Data Disk Failure, Heartbeats and Re-Replication
+
+   Each DataNode sends a Heartbeat message to the NameNode periodically. A
+   network partition can cause a subset of DataNodes to lose connectivity
+   with the NameNode. The NameNode detects this condition by the absence
+   of a Heartbeat message. The NameNode marks DataNodes without recent
+   Heartbeats as dead and does not forward any new IO requests to them.
+   Any data that was registered to a dead DataNode is not available to
+   HDFS any more. DataNode death may cause the replication factor of some
+   blocks to fall below their specified value. The NameNode constantly
+   tracks which blocks need to be replicated and initiates replication
+   whenever necessary. The necessity for re-replication may arise due to
+   many reasons: a DataNode may become unavailable, a replica may become
+   corrupted, a hard disk on a DataNode may fail, or the replication
+   factor of a file may be increased.
+
+Cluster Rebalancing
+
+   The HDFS architecture is compatible with data rebalancing schemes. A
+   scheme might automatically move data from one DataNode to another if
+   the free space on a DataNode falls below a certain threshold. In the
+   event of a sudden high demand for a particular file, a scheme might
+   dynamically create additional replicas and rebalance other data in the
+   cluster. These types of data rebalancing schemes are not yet
+   implemented.
+
+Data Integrity
+
+   It is possible that a block of data fetched from a DataNode arrives
+   corrupted. This corruption can occur because of faults in a storage
+   device, network faults, or buggy software. The HDFS client software
+   implements checksum checking on the contents of HDFS files. When a
+   client creates an HDFS file, it computes a checksum of each block of
+   the file and stores these checksums in a separate hidden file in the
+   same HDFS namespace. When a client retrieves file contents it verifies
+   that the data it received from each DataNode matches the checksum
+   stored in the associated checksum file. If not, then the client can opt
+   to retrieve that block from another DataNode that has a replica of that
+   block.
+
+Metadata Disk Failure
+
+   The FsImage and the EditLog are central data structures of HDFS. A
+   corruption of these files can cause the HDFS instance to be
+   non-functional. For this reason, the NameNode can be configured to
+   support maintaining multiple copies of the FsImage and EditLog. Any
+   update to either the FsImage or EditLog causes each of the FsImages and
+   EditLogs to get updated synchronously. This synchronous updating of
+   multiple copies of the FsImage and EditLog may degrade the rate of
+   namespace transactions per second that a NameNode can support. However,
+   this degradation is acceptable because even though HDFS applications
+   are very data intensive in nature, they are not metadata intensive.
+   When a NameNode restarts, it selects the latest consistent FsImage and
+   EditLog to use.
+
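+   A common way to configure such redundancy is to point the NameNode at
+   more than one local (or NFS-mounted) directory. An illustrative sketch,
+   assuming the <<<hdfs-site.xml>>> property name <<<dfs.namenode.name.dir>>>
+   and placeholder paths:
+
++---+
+<configuration>
+  <!-- Comma-separated list of directories; the FsImage and EditLog are
+       written synchronously to each of them (paths are placeholders) -->
+  <property>
+    <name>dfs.namenode.name.dir</name>
+    <value>/data/1/dfs/nn,/data/2/dfs/nn</value>
+  </property>
+</configuration>
++---+
+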
+   The NameNode machine is a single point of failure for an HDFS cluster.
+   If the NameNode machine fails, manual intervention is necessary.
+   Currently, automatic restart and failover of the NameNode software to
+   another machine is not supported.
+
+Snapshots
+
+   Snapshots support storing a copy of data at a particular instant of
+   time. One usage of the snapshot feature may be to roll back a corrupted
+   HDFS instance to a previously known good point in time. HDFS does not
+   currently support snapshots but will in a future release.
+
+Data Organization
+
+Data Blocks
+
+   HDFS is designed to support very large files. Applications that are
+   compatible with HDFS are those that deal with large data sets. These
+   applications write their data only once but they read it one or more
+   times and require these reads to be satisfied at streaming speeds. HDFS
+   supports write-once-read-many semantics on files. A typical block size
+   used by HDFS is 64 MB. Thus, an HDFS file is chopped up into 64 MB
+   chunks, and if possible, each chunk will reside on a different
+   DataNode.
+
+Staging
+
+   A client request to create a file does not reach the NameNode
+   immediately. In fact, initially the HDFS client caches the file data
+   into a temporary local file. Application writes are transparently
+   redirected to this temporary local file. When the local file
+   accumulates data worth over one HDFS block size, the client contacts
+   the NameNode. The NameNode inserts the file name into the file system
+   hierarchy and allocates a data block for it. The NameNode responds to
+   the client request with the identity of the DataNode and the
+   destination data block. Then the client flushes the block of data from
+   the local temporary file to the specified DataNode. When a file is
+   closed, the remaining un-flushed data in the temporary local file is
+   transferred to the DataNode. The client then tells the NameNode that
+   the file is closed. At this point, the NameNode commits the file
+   creation operation into a persistent store. If the NameNode dies before
+   the file is closed, the file is lost.
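+
+   From the application's point of view this staging is transparent: the
+   program writes to an output stream, and the file is committed only when the
+   stream is closed. A minimal sketch, with both paths hypothetical, copying a
+   local file into HDFS:
+
+----
+  import java.io.BufferedInputStream;
+  import java.io.FileInputStream;
+  import java.io.InputStream;
+
+  import org.apache.hadoop.conf.Configuration;
+  import org.apache.hadoop.fs.FSDataOutputStream;
+  import org.apache.hadoop.fs.FileSystem;
+  import org.apache.hadoop.fs.Path;
+  import org.apache.hadoop.io.IOUtils;
+
+  public class WriteExample {
+    public static void main(String[] args) throws Exception {
+      FileSystem fs = FileSystem.get(new Configuration());
+      // Both paths are hypothetical.
+      InputStream in = new BufferedInputStream(new FileInputStream("/tmp/local.txt"));
+      FSDataOutputStream out = fs.create(new Path("/foodir/remote.txt"));
+      // The client buffers writes and asks the NameNode for block allocations as
+      // whole blocks accumulate; the file is committed only when the stream closes.
+      IOUtils.copyBytes(in, out, 4096, true);   // true: close both streams when done
+      fs.close();
+    }
+  }
+----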
+
+   The above approach has been adopted after careful consideration of
+   target applications that run on HDFS. These applications need streaming
+   writes to files. If a client writes to a remote file directly without
+   any client side buffering, the network speed and the congestion in the
+   network impact throughput considerably. This approach is not without
+   precedent. Earlier distributed file systems, e.g. AFS, have used client
+   side caching to improve performance. A POSIX requirement has been
+   relaxed to achieve higher performance of data uploads.
+
+Replication Pipelining
+
+   When a client is writing data to an HDFS file, its data is first
+   written to a local file as explained in the previous section. Suppose
+   the HDFS file has a replication factor of three. When the local file
+   accumulates a full block of user data, the client retrieves a list of
+   DataNodes from the NameNode. This list contains the DataNodes that will
+   host a replica of that block. The client then flushes the data block to
+   the first DataNode. The first DataNode starts receiving the data in
+   small portions (4 KB), writes each portion to its local repository and
+   transfers that portion to the second DataNode in the list. The second
+   DataNode, in turn, starts receiving each portion of the data block,
+   writes that portion to its repository and then flushes that portion to
+   the third DataNode. Finally, the third DataNode writes the data to its
+   local repository. Thus, a DataNode can be receiving data from the
+   previous one in the pipeline and at the same time forwarding data to
+   the next one in the pipeline. In this way, the data is pipelined from one
+   DataNode to the next.
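+
+   The following is an illustrative sketch of the forwarding idea only, not
+   the actual DataNode code: each node in the pipeline receives small portions
+   from upstream, persists them locally, and forwards them downstream at the
+   same time.
+
+----
+  import java.io.IOException;
+  import java.io.InputStream;
+  import java.io.OutputStream;
+
+  // Illustrative sketch only -- not the actual DataNode implementation.
+  public class PipelineSketch {
+    static void relay(InputStream fromUpstream,
+                      OutputStream toLocalRepository,
+                      OutputStream toNextDataNode) throws IOException {
+      byte[] portion = new byte[4 * 1024];        // 4 KB portions, as described above
+      int n;
+      while ((n = fromUpstream.read(portion)) != -1) {
+        toLocalRepository.write(portion, 0, n);   // persist locally
+        if (toNextDataNode != null) {             // the last node has no successor
+          toNextDataNode.write(portion, 0, n);    // forward downstream immediately
+        }
+      }
+      toLocalRepository.flush();
+      if (toNextDataNode != null) {
+        toNextDataNode.flush();
+      }
+    }
+  }
+----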
+
+Accessibility
+
+   HDFS can be accessed from applications in many different ways.
+   Natively, HDFS provides a
+   {{{http://hadoop.apache.org/docs/current/api/}FileSystem Java API}}
+   for applications to use. A C language wrapper for this Java API is also
+   available. In addition, an HTTP browser can also be used to browse the files
+   of an HDFS instance. Work is in progress to expose HDFS through the WebDAV
+   protocol.
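+
+   As a brief illustration of the Java API, the sketch below opens a file and
+   copies it to standard output; the NameNode URI and the file path are
+   placeholders, not values taken from any particular cluster:
+
+----
+  import java.io.InputStream;
+  import java.net.URI;
+
+  import org.apache.hadoop.conf.Configuration;
+  import org.apache.hadoop.fs.FileSystem;
+  import org.apache.hadoop.fs.Path;
+  import org.apache.hadoop.io.IOUtils;
+
+  public class CatExample {
+    public static void main(String[] args) throws Exception {
+      // The NameNode address below is a placeholder for a real cluster URI.
+      FileSystem fs = FileSystem.get(URI.create("hdfs://namenode:8020/"),
+          new Configuration());
+      InputStream in = fs.open(new Path("/foodir/myfile.txt"));
+      try {
+        IOUtils.copyBytes(in, System.out, 4096, false);   // false: keep System.out open
+      } finally {
+        IOUtils.closeStream(in);
+        fs.close();
+      }
+    }
+  }
+----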
+
+FS Shell
+
+   HDFS allows user data to be organized in the form of files and
+   directories. It provides a commandline interface called FS shell that
+   lets a user interact with the data in HDFS. The syntax of this command
+   set is similar to other shells (e.g. bash, csh) that users are already
+   familiar with. Here are some sample action/command pairs:
+
+*---------+---------+
+|| Action | Command
+*---------+---------+
+| Create a directory named <<</foodir>>> | <<<bin/hadoop dfs -mkdir /foodir>>>
+*---------+---------+
+| Remove a directory named <<</foodir>>> | <<<bin/hadoop dfs -rmr /foodir>>>
+*---------+---------+
+| View the contents of a file named <<</foodir/myfile.txt>>> | <<<bin/hadoop dfs -cat /foodir/myfile.txt>>>
+*---------+---------+
+
+   FS shell is targeted for applications that need a scripting language to
+   interact with the stored data.
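+
+   Applications written in Java can also drive the same commands
+   programmatically through the <<<FsShell>>> tool class; a minimal sketch
+   (the listed path is only an example):
+
+----
+  import org.apache.hadoop.conf.Configuration;
+  import org.apache.hadoop.fs.FsShell;
+  import org.apache.hadoop.util.ToolRunner;
+
+  public class FsShellExample {
+    public static void main(String[] args) throws Exception {
+      Configuration conf = new Configuration();
+      // Equivalent to "bin/hadoop dfs -ls /foodir" (the path is an example).
+      int exitCode = ToolRunner.run(conf, new FsShell(),
+          new String[] {"-ls", "/foodir"});
+      System.exit(exitCode);
+    }
+  }
+----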
+
+DFSAdmin
+
+   The DFSAdmin command set is used for administering an HDFS cluster.
+   These are commands that are used only by an HDFS administrator. Here
+   are some sample action/command pairs:
+
+*---------+---------+
+|| Action | Command
+*---------+---------+
+|Put the cluster in Safemode              | <<<bin/hadoop dfsadmin -safemode enter>>>
+*---------+---------+
+|Generate a list of DataNodes             | <<<bin/hadoop dfsadmin -report>>>
+*---------+---------+
+|Recommission or decommission DataNode(s) | <<<bin/hadoop dfsadmin -refreshNodes>>>
+*---------+---------+
+
+Browser Interface
+
+   A typical HDFS install configures a web server to expose the HDFS
+   namespace through a configurable TCP port. This allows a user to
+   navigate the HDFS namespace and view the contents of its files using a
+   web browser.
+
+Space Reclamation
+
+File Deletes and Undeletes
+
+   When a file is deleted by a user or an application, it is not
+   immediately removed from HDFS. Instead, HDFS first renames it to a file
+   in the <<</trash>>> directory. The file can be restored quickly as long as it
+   remains in <<</trash>>>. A file remains in <<</trash>>> for a configurable amount
+   of time. After the expiry of its life in <<</trash>>>, the NameNode deletes
+   the file from the HDFS namespace. The deletion of a file causes the
+   blocks associated with the file to be freed. Note that there could be
+   an appreciable time delay between the time a file is deleted by a user
+   and the time of the corresponding increase in free space in HDFS.
+
+   A user can undelete a file after deleting it as long as it remains in
+   the <<</trash>>> directory. To undelete a file, the user can navigate the
+   <<</trash>>> directory and retrieve the
+   file. The <<</trash>>> directory contains only the latest copy of the file
+   that was deleted. The <<</trash>>> directory is just like any other directory
+   with one special feature: HDFS applies specified policies to
+   automatically delete files from this directory. The current default
+   policy is to delete files from <<</trash>>> that are more than 6 hours old.
+   In the future, this policy will be configurable through a well defined
+   interface.
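+
+   Retrieving a file is essentially a rename out of the trash directory. The
+   sketch below assumes, purely for illustration, that the deleted file ended
+   up at <<</trash/foodir/myfile.txt>>>; the actual trash layout may differ
+   between deployments:
+
+----
+  import org.apache.hadoop.conf.Configuration;
+  import org.apache.hadoop.fs.FileSystem;
+  import org.apache.hadoop.fs.Path;
+
+  public class UndeleteExample {
+    public static void main(String[] args) throws Exception {
+      FileSystem fs = FileSystem.get(new Configuration());
+      // Both paths are assumptions for illustration; adjust to the real trash layout.
+      Path trashed = new Path("/trash/foodir/myfile.txt");
+      Path restored = new Path("/foodir/myfile.txt");
+      boolean ok = fs.rename(trashed, restored);   // move the file back into place
+      System.out.println("restored: " + ok);
+      fs.close();
+    }
+  }
+----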
+
+Decrease Replication Factor
+
+   When the replication factor of a file is reduced, the NameNode selects
+   excess replicas that can be deleted. The next Heartbeat transfers this
+   information to the DataNode. The DataNode then removes the
+   corresponding blocks and the corresponding free space appears in the
+   cluster. Once again, there might be a time delay between the completion
+   of the setReplication API call and the appearance of free space in the
+   cluster.
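+
+   The <<<setReplication>>> call referred to above is part of the public
+   <<<FileSystem>>> API; a minimal sketch (the path and the target factor of 2
+   are examples):
+
+----
+  import org.apache.hadoop.conf.Configuration;
+  import org.apache.hadoop.fs.FileSystem;
+  import org.apache.hadoop.fs.Path;
+
+  public class SetReplicationExample {
+    public static void main(String[] args) throws Exception {
+      FileSystem fs = FileSystem.get(new Configuration());
+      // Lower the replication factor of an example file to 2.
+      boolean scheduled = fs.setReplication(new Path("/foodir/myfile.txt"), (short) 2);
+      System.out.println("replication change accepted: " + scheduled);
+      fs.close();
+    }
+  }
+----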
+
+References
+
+   Hadoop {{{http://hadoop.apache.org/docs/current/api/}JavaDoc API}}.
+
+   HDFS source code: {{http://hadoop.apache.org/version_control.html}}

+ 0 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/resources/images/hdfs-logo.jpg → hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfs-logo.jpg


+ 0 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/resources/images/hdfsarchitecture.gif → hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsarchitecture.gif


+ 0 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/resources/images/hdfsarchitecture.odg → hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsarchitecture.odg


+ 0 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/resources/images/hdfsarchitecture.png → hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsarchitecture.png


+ 0 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/resources/images/hdfsdatanodes.gif → hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsdatanodes.gif


+ 0 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/resources/images/hdfsdatanodes.odg → hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsdatanodes.odg


+ 0 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/resources/images/hdfsdatanodes.png → hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsdatanodes.png


+ 0 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/resources/images/hdfsproxy-forward.jpg → hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsproxy-forward.jpg


+ 0 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/resources/images/hdfsproxy-overview.jpg → hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsproxy-overview.jpg


+ 0 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/docs/src/documentation/resources/images/hdfsproxy-server.jpg → hadoop-hdfs-project/hadoop-hdfs/src/site/resources/images/hdfsproxy-server.jpg


+ 2 - 0
hadoop-project/src/site/site.xml

@@ -51,6 +51,8 @@
       <item name="Single Node Setup" href="hadoop-project-dist/hadoop-common/SingleCluster.html"/>
       <item name="Cluster Setup" href="hadoop-project-dist/hadoop-common/ClusterSetup.html"/>
       <item name="CLI Mini Cluster" href="hadoop-project-dist/hadoop-common/CLIMiniCluster.html"/>
+      <item name="File System Shell" href="hadoop-project-dist/hadoop-common/FileSystemShell.html"/>
+      <item name="Hadoop Commands Reference" href="hadoop-project-dist/hadoop-common/CommandsManual.html"/>
     </menu>
     
     <menu name="HDFS" inherit="top">

Some files were not shown because too many files changed in this diff