11 lat temu · f874b20061
--- a/hadoop-common-project/hadoop-common/CHANGES.txt
+++ b/hadoop-common-project/hadoop-common/CHANGES.txt
@@ -301,6 +301,9 @@ Release 2.4.0 - UNRELEASED
 
				 
			
 
				   IMPROVEMENTS
			
 
				 
			
 
				+    HADOOP-10139. Update and improve the Single Cluster Setup document.
			
 
				+    (Akira Ajisaka via Arpit Agarwal)
			
 
				+
			
 
				   OPTIMIZATIONS
			
 
				 
			
 
				   BUG FIXES
			
--- a/hadoop-common-project/hadoop-common/src/site/apt/SingleCluster.apt.vm
+++ b/hadoop-common-project/hadoop-common/src/site/apt/SingleCluster.apt.vm
@@ -20,174 +20,267 @@ Hadoop MapReduce Next Generation - Setting up a Single Node Cluster.
 
				 
			
 
				 %{toc|section=1|fromDepth=0}
			
 
				 
			
 
				-* Mapreduce Tarball
			
 
				+* Purpose
			
 
				 
			
 
				-  You should be able to obtain the MapReduce tarball from the release.
			
 
				-  If not, you should be able to create a tarball from the source.
			
 
				+  This document describes how to set up and configure a single-node Hadoop
			
 
				+  installation so that you can quickly perform simple operations using Hadoop
			
 
				+  MapReduce and the Hadoop Distributed File System (HDFS).
			
 
				 
			
 
				-+---+
			
 
				-$ mvn clean install -DskipTests
			
 
				-$ cd hadoop-mapreduce-project
			
 
				-$ mvn clean install assembly:assembly -Pnative
			
 
				-+---+
			
 
				-  <<NOTE:>> You will need {{{http://code.google.com/p/protobuf}protoc 2.5.0}}
			
 
				-            installed.
			
 
				+* Prerequisites
			
 
				 
			
 
				-  To ignore the native builds in mapreduce you can omit the <<<-Pnative>>> argument
			
 
				-  for maven. The tarball should be available in <<<target/>>> directory. 
			
 
				+** Supported Platforms
			
 
				 
			
 
				-  
			
 
				-* Setting up the environment.
			
 
				+   * GNU/Linux is supported as a development and production platform.
			
 
				+     Hadoop has been demonstrated on GNU/Linux clusters with 2000 nodes.
			
 
				 
			
 
				-  Assuming you have installed hadoop-common/hadoop-hdfs and exported
			
 
				-  <<$HADOOP_COMMON_HOME>>/<<$HADOOP_HDFS_HOME>>, untar hadoop mapreduce 
			
 
				-  tarball and set environment variable <<$HADOOP_MAPRED_HOME>> to the 
			
 
				-  untarred directory. Set <<$HADOOP_YARN_HOME>> the same as <<$HADOOP_MAPRED_HOME>>. 
			
 
				- 
			
 
				-  <<NOTE:>> The following instructions assume you have hdfs running.
			
 
				+   * Windows is also a supported platform but the followings steps
			
 
				+     are for Linux only. To set up Hadoop on Windows, see
			
 
				+     {{{http://wiki.apache.org/hadoop/Hadoop2OnWindows}wiki page}}.
			
 
				 
			
 
				-* Setting up Configuration.
			
 
				+** Required Software
			
 
				 
			
 
				-  To start the ResourceManager and NodeManager, you will have to update the configs.
			
 
				-  Assuming your $HADOOP_CONF_DIR is the configuration directory and has the installed
			
 
				-  configs for HDFS and <<<core-site.xml>>>. There are 2 config files you will have to setup
			
 
				-  <<<mapred-site.xml>>> and <<<yarn-site.xml>>>.
			
 
				+   Required software for Linux include:
			
 
				 
			
 
				-** Setting up <<<mapred-site.xml>>>
			
 
				+   [[1]] Java\u2122 must be installed. Recommended Java versions are described
			
 
				+         at {{{http://wiki.apache.org/hadoop/HadoopJavaVersions}
			
 
				+         HadoopJavaVersions}}.
			
 
				 
			
 
				-  Add the following configs to your <<<mapred-site.xml>>>.
			
 
				+   [[2]] ssh must be installed and sshd must be running to use the Hadoop
			
 
				+         scripts that manage remote Hadoop daemons.
			
 
				 
			
 
				-+---+
			
 
				-  <property>
			
 
				-    <name>mapreduce.cluster.temp.dir</name>
			
 
				-    <value></value>
			
 
				-    <description>No description</description>
			
 
				-    <final>true</final>
			
 
				-  </property>
			
 
				-
			
 
				-  <property>
			
 
				-    <name>mapreduce.cluster.local.dir</name>
			
 
				-    <value></value>
			
 
				-    <description>No description</description>
			
 
				-    <final>true</final>
			
 
				-  </property>
			
 
				-+---+
			
 
				+** Installing Software
			
 
				 
			
 
				-** Setting up <<<yarn-site.xml>>>
			
 
				+  If your cluster doesn't have the requisite software you will need to install
			
 
				+  it.
			
 
				 
			
 
				-Add the following configs to your <<<yarn-site.xml>>>
			
 
				+  For example on Ubuntu Linux:
			
 
				 
			
 
				-+---+
			
 
				-  <property>
			
 
				-    <name>yarn.resourcemanager.resource-tracker.address</name>
			
 
				-    <value>host:port</value>
			
 
				-    <description>host is the hostname of the resource manager and 
			
 
				-    port is the port on which the NodeManagers contact the Resource Manager.
			
 
				-    </description>
			
 
				-  </property>
			
 
				-
			
 
				-  <property>
			
 
				-    <name>yarn.resourcemanager.scheduler.address</name>
			
 
				-    <value>host:port</value>
			
 
				-    <description>host is the hostname of the resourcemanager and port is the port
			
 
				-    on which the Applications in the cluster talk to the Resource Manager.
			
 
				-    </description>
			
 
				-  </property>
			
 
				-
			
 
				-  <property>
			
 
				-    <name>yarn.resourcemanager.scheduler.class</name>
			
 
				-    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
			
 
				-    <description>In case you do not want to use the default scheduler</description>
			
 
				-  </property>
			
 
				-
			
 
				-  <property>
			
 
				-    <name>yarn.resourcemanager.address</name>
			
 
				-    <value>host:port</value>
			
 
				-    <description>the host is the hostname of the ResourceManager and the port is the port on
			
 
				-    which the clients can talk to the Resource Manager. </description>
			
 
				-  </property>
			
 
				-
			
 
				-  <property>
			
 
				-    <name>yarn.nodemanager.local-dirs</name>
			
 
				-    <value></value>
			
 
				-    <description>the local directories used by the nodemanager</description>
			
 
				-  </property>
			
 
				-
			
 
				-  <property>
			
 
				-    <name>yarn.nodemanager.address</name>
			
 
				-    <value>0.0.0.0:port</value>
			
 
				-    <description>the nodemanagers bind to this port</description>
			
 
				-  </property>  
			
 
				-
			
 
				-  <property>
			
 
				-    <name>yarn.nodemanager.resource.memory-mb</name>
			
 
				-    <value>10240</value>
			
 
				-    <description>the amount of memory on the NodeManager in GB</description>
			
 
				-  </property>
			
 
				- 
			
 
				-  <property>
			
 
				-    <name>yarn.nodemanager.remote-app-log-dir</name>
			
 
				-    <value>/app-logs</value>
			
 
				-    <description>directory on hdfs where the application logs are moved to </description>
			
 
				-  </property>
			
 
				-
			
 
				-   <property>
			
 
				-    <name>yarn.nodemanager.log-dirs</name>
			
 
				-    <value></value>
			
 
				-    <description>the directories used by Nodemanagers as log directories</description>
			
 
				-  </property>
			
 
				-
			
 
				-  <property>
			
 
				-    <name>yarn.nodemanager.aux-services</name>
			
 
				-    <value>mapreduce_shuffle</value>
			
 
				-    <description>shuffle service that needs to be set for Map Reduce to run </description>
			
 
				-  </property>
			
 
				-+---+
			
 
				+----
			
 
				+  $ sudo apt-get install ssh
			
 
				+  $ sudo apt-get install rsync
			
 
				+----
			
 
				+
			
 
				+* Download
			
 
				+
			
 
				+  To get a Hadoop distribution, download a recent stable release from one of
			
 
				+  the {{{http://www.apache.org/dyn/closer.cgi/hadoop/common/}
			
 
				+  Apache Download Mirrors}}.
			
 
				+
			
 
				+* Prepare to Start the Hadoop Cluster
			
 
				+
			
 
				+  Unpack the downloaded Hadoop distribution. In the distribution, edit
			
 
				+  the file <<<etc/hadoop/hadoop-env.sh>>> to define some parameters as
			
 
				+  follows:
			
 
				+
			
 
				+----
			
 
				+  # set to the root of your Java installation
			
 
				+  export JAVA_HOME=/usr/java/latest
			
 
				+
			
 
				+  # Assuming your installation directory is /usr/local/hadoop
			
 
				+  export HADOOP_PREFIX=/usr/local/hadoop
			
 
				+----
			
 
				+
			
 
				+  Try the following command:
			
 
				+
			
 
				+----
			
 
				+  $ bin/hadoop
			
 
				+----
			
 
				+
			
 
				+  This will display the usage documentation for the hadoop script.
			
 
				+
			
 
				+  Now you are ready to start your Hadoop cluster in one of the three supported
			
 
				+  modes:
			
 
				+
			
 
				+   * {{{Standalone Operation}Local (Standalone) Mode}}
			
 
				+
			
 
				+   * {{{Pseudo-Distributed Operation}Pseudo-Distributed Mode}}
			
 
				 
			
 
				-* Setting up <<<capacity-scheduler.xml>>>
			
 
				+   * {{{Fully-Distributed Operation}Fully-Distributed Mode}}
			
 
				 
			
 
				-   Make sure you populate the root queues in <<<capacity-scheduler.xml>>>.
			
 
				+* Standalone Operation
			
 
				+
			
 
				+  By default, Hadoop is configured to run in a non-distributed mode, as a
			
 
				+  single Java process. This is useful for debugging.
			
 
				+
			
 
				+  The following example copies the unpacked conf directory to use as input
			
 
				+  and then finds and displays every match of the given regular expression.
			
 
				+  Output is written to the given output directory.
			
 
				+
			
 
				+----
			
 
				+  $ mkdir input
			
 
				+  $ cp etc/hadoop/*.xml input
			
 
				+  $ bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-${project.version}.jar grep input output 'dfs[a-z.]+'
			
 
				+  $ cat output/*
			
 
				+----
			
 
				+
			
 
				+* Pseudo-Distributed Operation
			
 
				+
			
 
				+  Hadoop can also be run on a single-node in a pseudo-distributed mode where
			
 
				+  each Hadoop daemon runs in a separate Java process.
			
 
				+
			
 
				+** Configuration
			
 
				+
			
 
				+  Use the following:
			
 
				+
			
 
				+  etc/hadoop/core-site.xml:
			
 
				 
			
 
				 +---+
			
 
				-  <property>
			
 
				-    <name>yarn.scheduler.capacity.root.queues</name>
			
 
				-    <value>unfunded,default</value>
			
 
				-  </property>
			
 
				-  
			
 
				-  <property>
			
 
				-    <name>yarn.scheduler.capacity.root.capacity</name>
			
 
				-    <value>100</value>
			
 
				-  </property>
			
 
				-  
			
 
				-  <property>
			
 
				-    <name>yarn.scheduler.capacity.root.unfunded.capacity</name>
			
 
				-    <value>50</value>
			
 
				-  </property>
			
 
				+<configuration>
			
 
				+    <property>
			
 
				+        <name>fs.defaultFS</name>
			
 
				+        <value>hdfs://localhost:9000</value>
			
 
				+    </property>
			
 
				+</configuration>
			
 
				++---+
			
 
				+
			
 
				+  etc/hadoop/hdfs-site.xml:
			
 
				   
			
 
				-  <property>
			
 
				-    <name>yarn.scheduler.capacity.root.default.capacity</name>
			
 
				-    <value>50</value>
			
 
				-  </property>
			
 
				++---+
			
 
				+<configuration>
			
 
				+    <property>
			
 
				+        <name>dfs.replication</name>
			
 
				+        <value>1</value>
			
 
				+    </property>
			
 
				+</configuration>
			
 
				 +---+
			
 
				 
			
 
				-* Running daemons.
			
 
				+** Setup passphraseless ssh
			
 
				+
			
 
				+  Now check that you can ssh to the localhost without a passphrase:
			
 
				+
			
 
				+----
			
 
				+  $ ssh localhost
			
 
				+----
			
 
				+
			
 
				+  If you cannot ssh to localhost without a passphrase, execute the
			
 
				+  following commands:
			
 
				+
			
 
				+----
			
 
				+  $ ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
			
 
				+  $ cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
			
 
				+----
			
 
				+
			
 
				+** Execution
			
 
				+
			
 
				+  The following instructions are to run a MapReduce job locally.
			
 
				+  If you want to execute a job on YARN, see {{YARN on Single Node}}.
			
 
				+
			
 
				+  [[1]] Format the filesystem:
			
 
				+
			
 
				+----
			
 
				+  $ bin/hdfs namenode -format
			
 
				+----
			
 
				+
			
 
				+  [[2]] Start NameNode daemon and DataNode daemon:
			
 
				+
			
 
				+----
			
 
				+  $ sbin/start-dfs.sh
			
 
				+----
			
 
				+
			
 
				+        The hadoop daemon log output is written to the <<<${HADOOP_LOG_DIR}>>>
			
 
				+        directory (defaults to <<<${HADOOP_HOME}/logs>>>).
			
 
				+
			
 
				+  [[3]] Browse the web interface for the NameNode; by default it is
			
 
				+        available at:
			
 
				+
			
 
				+        * NameNode - <<<http://localhost:50070/>>>
			
 
				+
			
 
				+  [[4]] Make the HDFS directories required to execute MapReduce jobs:
			
 
				+
			
 
				+----
			
 
				+  $ bin/hdfs dfs -mkdir /user
			
 
				+  $ bin/hdfs dfs -mkdir /user/<username>
			
 
				+----
			
 
				+
			
 
				+  [[5]] Copy the input files into the distributed filesystem:
			
 
				+
			
 
				+----
			
 
				+  $ bin/hdfs dfs -put etc/hadoop input
			
 
				+----
			
 
				+
			
 
				+  [[6]] Run some of the examples provided:
			
 
				+
			
 
				+----
			
 
				+  $ bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-${project.version}.jar grep input output 'dfs[a-z.]+'
			
 
				+----
			
 
				+
			
 
				+  [[7]] Examine the output files:
			
 
				+
			
 
				+        Copy the output files from the distributed filesystem to the local
			
 
				+        filesystem and examine them:
			
 
				+
			
 
				+----
			
 
				+  $ bin/hdfs dfs -get output output
			
 
				+  $ cat output/*
			
 
				+----
			
 
				+
			
 
				+        or
			
 
				+
			
 
				+        View the output files on the distributed filesystem:
			
 
				+
			
 
				+----
			
 
				+  $ bin/hdfs dfs -cat output/*
			
 
				+----
			
 
				+
			
 
				+  [[8]] When you're done, stop the daemons with:
			
 
				+
			
 
				+----
			
 
				+  $ sbin/stop-dfs.sh
			
 
				+----
			
 
				+
			
 
				+** YARN on Single Node
			
 
				+
			
 
				+  You can run a MapReduce job on YARN in a pseudo-distributed mode by setting
			
 
				+  a few parameters and running ResourceManager daemon and NodeManager daemon
			
 
				+  in addition.
			
 
				+
			
 
				+  The following instructions assume that 1. ~ 4. steps of
			
 
				+  {{{Execution}the above instructions}} are already executed.
			
 
				+
			
 
				+  [[1]] Configure parameters as follows:
			
 
				+
			
 
				+        etc/hadoop/mapred-site.xml:
			
 
				 
			
 
				-  Assuming that the environment variables <<$HADOOP_COMMON_HOME>>, <<$HADOOP_HDFS_HOME>>, <<$HADOO_MAPRED_HOME>>,
			
 
				-  <<$HADOOP_YARN_HOME>>, <<$JAVA_HOME>> and <<$HADOOP_CONF_DIR>> have been set appropriately.
			
 
				-  Set $<<$YARN_CONF_DIR>> the same as $<<HADOOP_CONF_DIR>>
			
 
				- 
			
 
				-  Run ResourceManager and NodeManager as:
			
 
				-  
			
 
				 +---+
			
 
				-$ cd $HADOOP_MAPRED_HOME
			
 
				-$ sbin/yarn-daemon.sh start resourcemanager
			
 
				-$ sbin/yarn-daemon.sh start nodemanager
			
 
				+<configuration>
			
 
				+    <property>
			
 
				+        <name>mapreduce.framework.name</name>
			
 
				+        <value>yarn</value>
			
 
				+    </property>
			
 
				+</configuration>
			
 
				 +---+
			
 
				 
			
 
				-  You should be up and running. You can run randomwriter as:
			
 
				+        etc/hadoop/yarn-site.xml:
			
 
				 
			
 
				 +---+
			
 
				-$ $HADOOP_COMMON_HOME/bin/hadoop jar hadoop-examples.jar randomwriter out
			
 
				+<configuration>
			
 
				+    <property>
			
 
				+        <name>yarn.nodemanager.aux-services</name>
			
 
				+        <value>mapreduce_shuffle</value>
			
 
				+    </property>
			
 
				+</configuration>
			
 
				 +---+
			
 
				 
			
 
				-Good luck.
			
 
				+  [[2]] Start ResourceManager daemon and NodeManager daemon:
			
 
				+
			
 
				+----
			
 
				+  $ sbin/start-yarn.sh
			
 
				+----
			
 
				+
			
 
				+  [[3]] Browse the web interface for the ResourceManager; by default it is
			
 
				+        available at:
			
 
				+
			
 
				+        * ResourceManager - <<<http://localhost:8088/>>>
			
 
				+
			
 
				+  [[4]] Run a MapReduce job.
			
 
				+
			
 
				+  [[5]] When you're done, stop the daemons with:
			
 
				+
			
 
				+----
			
 
				+  $ sbin/stop-yarn.sh
			
 
				+----
			
 
				+
			
 
				+* Fully-Distributed Operation
			
 
				+
			
 
				+  For information on setting up fully-distributed, non-trivial clusters
			
 
				+  see {{{./ClusterSetup.html}Cluster Setup}}.