apache
/
hadoop
의 미러 https://github.com/apache/hadoop.git


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
01702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777
							<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta content="Apache Forrest" name="Generator">
<meta name="Forrest-version" content="0.8">
<meta name="Forrest-skin-name" content="pelt">
<title>Hadoop Cluster Setup</title>
<link type="text/css" href="skin/basic.css" rel="stylesheet">
<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
<link type="text/css" href="skin/profile.css" rel="stylesheet">
<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
<link rel="shortcut icon" href="images/favicon.ico">
</head>
<body onload="init()">
<script type="text/javascript">ndeSetTextSize();</script>
<div id="top">
<!--+
    |breadtrail
    +-->
<div class="breadtrail">
<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/core/">Core</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
    |header
    +-->
<div class="header">
<!--+
    |start group logo
    +-->
<div class="grouplogo">
<a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
</div>
<!--+
    |end group logo
    +-->
<!--+
    |start Project Logo
    +-->
<div class="projectlogo">
<a href="http://hadoop.apache.org/core/"><img class="logoImage" alt="Hadoop" src="images/core-logo.gif" title="Scalable Computing Platform"></a>
</div>
<!--+
    |end Project Logo
    +-->
<!--+
    |start Search
    +-->
<div class="searchbox">
<form action="http://www.google.com/search" method="get" class="roundtopsmall">
<input value="hadoop.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp; 
                    <input name="Search" value="Search" type="submit">
</form>
</div>
<!--+
    |end search
    +-->
<!--+
    |start Tabs
    +-->
<ul id="tabs">
<li>
<a class="unselected" href="http://hadoop.apache.org/core/">Project</a>
</li>
<li>
<a class="unselected" href="http://wiki.apache.org/hadoop">Wiki</a>
</li>
<li class="current">
<a class="selected" href="index.html">Hadoop 0.16 Documentation</a>
</li>
</ul>
<!--+
    |end Tabs
    +-->
</div>
</div>
<div id="main">
<div id="publishedStrip">
<!--+
    |start Subtabs
    +-->
<div id="level2tabs"></div>
<!--+
    |end Endtabs
    +-->
<script type="text/javascript"><!--
document.write("Last Published: " + document.lastModified);
//  --></script>
</div>
<!--+
    |breadtrail
    +-->
<div class="breadtrail">

             &nbsp;
           </div>
<!--+
    |start Menu, mainarea
    +-->
<!--+
    |start Menu
    +-->
<div id="menu">
<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Documentation</div>
<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
<div class="menuitem">
<a href="index.html">Overview</a>
</div>
<div class="menuitem">
<a href="quickstart.html">Quickstart</a>
</div>
<div class="menupage">
<div class="menupagetitle">Cluster Setup</div>
</div>
<div class="menuitem">
<a href="hdfs_design.html">HDFS Architecture</a>
</div>
<div class="menuitem">
<a href="hdfs_user_guide.html">HDFS User Guide</a>
</div>
<div class="menuitem">
<a href="hdfs_shell.html">HDFS Shell Guide</a>
</div>
<div class="menuitem">
<a href="hdfs_permissions_guide.html">HDFS Permissions Guide</a>
</div>
<div class="menuitem">
<a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
</div>
<div class="menuitem">
<a href="native_libraries.html">Native Hadoop Libraries</a>
</div>
<div class="menuitem">
<a href="streaming.html">Streaming</a>
</div>
<div class="menuitem">
<a href="hod.html">Hadoop On Demand</a>
</div>
<div class="menuitem">
<a href="api/index.html">API Docs</a>
</div>
<div class="menuitem">
<a href="http://wiki.apache.org/hadoop/">Wiki</a>
</div>
<div class="menuitem">
<a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a>
</div>
<div class="menuitem">
<a href="http://hadoop.apache.org/core/mailing_lists.html">Mailing Lists</a>
</div>
<div class="menuitem">
<a href="changes.html">Release Notes</a>
</div>
</div>
<div id="credit"></div>
<div id="roundbottom">
<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
<!--+
  |alternative credits
  +-->
<div id="credit2"></div>
</div>
<!--+
    |end Menu
    +-->
<!--+
    |start content
    +-->
<div id="content">
<div title="Portable Document Format" class="pdflink">
<a class="dida" href="cluster_setup.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
        PDF</a>
</div>
<h1>Hadoop Cluster Setup</h1>
<div id="minitoc-area">
<ul class="minitoc">
<li>
<a href="#Purpose">Purpose</a>
</li>
<li>
<a href="#Pre-requisites">Pre-requisites</a>
</li>
<li>
<a href="#Installation">Installation</a>
</li>
<li>
<a href="#Configuration">Configuration</a>
<ul class="minitoc">
<li>
<a href="#Configuration+Files">Configuration Files</a>
</li>
<li>
<a href="#Site+Configuration">Site Configuration</a>
<ul class="minitoc">
<li>
<a href="#Configuring+the+Environment+of+the+Hadoop+Daemons">Configuring the Environment of the Hadoop Daemons</a>
</li>
<li>
<a href="#Configuring+the+Hadoop+Daemons">Configuring the Hadoop Daemons</a>
</li>
<li>
<a href="#Slaves">Slaves</a>
</li>
<li>
<a href="#Logging">Logging</a>
</li>
</ul>
</li>
</ul>
</li>
<li>
<a href="#Hadoop+Rack+Awareness">Hadoop Rack Awareness</a>
</li>
<li>
<a href="#Hadoop+Startup">Hadoop Startup</a>
</li>
<li>
<a href="#Hadoop+Shutdown">Hadoop Shutdown</a>
</li>
</ul>
</div>
  
    
<a name="N1000D"></a><a name="Purpose"></a>
<h2 class="h3">Purpose</h2>
<div class="section">
<p>This document describes how to install, configure and manage non-trivial
      Hadoop clusters ranging from a few nodes to extremely large clusters with 
      thousands of nodes.</p>
<p>If you are looking to install Hadoop on a single machine to play
      with it, you can find relevant details <a href="quickstart.html">here</a>.
      </p>
</div>
    
    
<a name="N1001E"></a><a name="Pre-requisites"></a>
<h2 class="h3">Pre-requisites</h2>
<div class="section">
<ol>
        
<li>
          Make sure all <a href="quickstart.html#PreReqs">requisite</a> software 
          is installed on all nodes in your cluster.
        </li>
        
<li>
          
<a href="quickstart.html#Download">Get</a> the Hadoop software.
        </li>
      
</ol>
</div>
    
    
<a name="N10036"></a><a name="Installation"></a>
<h2 class="h3">Installation</h2>
<div class="section">
<p>Installing a Hadoop cluster typically involves unpacking the software 
      on all the machines in the cluster.</p>
<p>Typically one machine in the cluster is designated as the 
      <span class="codefrag">NameNode</span> and another machine as the <span class="codefrag">JobTracker</span>,
      exclusively. These are the <em>masters</em>. The rest of the machines in 
      the cluster act as both <span class="codefrag">DataNode</span> <em>and</em> 
      <span class="codefrag">TaskTracker</span>. These are the <em>slaves</em>.</p>
<p>The root of the distribution is referred to as 
      <span class="codefrag">HADOOP_HOME</span>. All machines in the cluster usually have the same 
      <span class="codefrag">HADOOP_HOME</span> path.</p>
</div>
    
    
<a name="N10061"></a><a name="Configuration"></a>
<h2 class="h3">Configuration</h2>
<div class="section">
<p>The following sections describe how to configure a Hadoop cluster.</p>
<a name="N1006A"></a><a name="Configuration+Files"></a>
<h3 class="h4">Configuration Files</h3>
<p>Hadoop configuration is driven by two important configuration files
        found in the <span class="codefrag">conf/</span> directory of the distribution:</p>
<ol>
          
<li>
            
<a href="http://hadoop.apache.org/core/docs/current/hadoop-default.html">hadoop-default.xml</a> - Read-only 
            default configuration.
          </li>
          
<li>
            
<em>hadoop-site.xml</em> - Site-specific configuration.
          </li>
        
</ol>
<p>To learn more about how the Hadoop framework is controlled by these 
        configuration files, look 
        <a href="api/org/apache/hadoop/conf/Configuration.html">here</a>.</p>
<p>Additionally, you can control the Hadoop scripts found in the 
        <span class="codefrag">bin/</span> directory of the distribution, by setting site-specific 
        values via the <span class="codefrag">conf/hadoop-env.sh</span>.</p>
<a name="N10097"></a><a name="Site+Configuration"></a>
<h3 class="h4">Site Configuration</h3>
<p>To configure the Hadoop cluster you will need to configure the
        <em>environment</em> in which the Hadoop daemons execute as well as
        the <em>configuration parameters</em> for the Hadoop daemons.</p>
<p>The Hadoop daemons are <span class="codefrag">NameNode</span>/<span class="codefrag">DataNode</span> 
        and <span class="codefrag">JobTracker</span>/<span class="codefrag">TaskTracker</span>.</p>
<a name="N100B5"></a><a name="Configuring+the+Environment+of+the+Hadoop+Daemons"></a>
<h4>Configuring the Environment of the Hadoop Daemons</h4>
<p>Administrators should use the <span class="codefrag">conf/hadoop-env.sh</span> script
          to do site-specific customization of the Hadoop daemons' process 
          environment.</p>
<p>At the very least you should specify the
          <span class="codefrag">JAVA_HOME</span> so that it is correctly defined on each
          remote node.</p>
<p>Other useful configuration parameters that you can customize 
          include:</p>
<ul>
            
<li>
              
<span class="codefrag">HADOOP_LOG_DIR</span> - The directory where the daemons'
              log files are stored. They are automatically created if they don't
              exist.
            </li>
            
<li>
              
<span class="codefrag">HADOOP_HEAPSIZE</span> - The maximum amount of heapsize 
              to use, in MB e.g. <span class="codefrag">2000MB</span>.
            </li>
          
</ul>
<a name="N100DD"></a><a name="Configuring+the+Hadoop+Daemons"></a>
<h4>Configuring the Hadoop Daemons</h4>
<p>This section deals with important parameters to be specified in the
          <span class="codefrag">conf/hadoop-site.xml</span> for the Hadoop cluster.</p>
<table class="ForrestTable" cellspacing="1" cellpadding="4">
  		    
<tr>
		      
<th colspan="1" rowspan="1">Parameter</th>
		      <th colspan="1" rowspan="1">Value</th> 
		      <th colspan="1" rowspan="1">Notes</th>
		    
</tr>
  		    
<tr>
		      
<td colspan="1" rowspan="1">fs.default.name</td>
  		      <td colspan="1" rowspan="1">Hostname or IP address of <span class="codefrag">NameNode</span>.</td>
		      <td colspan="1" rowspan="1"><em>host:port</em> pair.</td>
		    
</tr>
		    
<tr>
		      
<td colspan="1" rowspan="1">mapred.job.tracker</td>
		      <td colspan="1" rowspan="1">Hostname or IP address of <span class="codefrag">JobTracker</span>.</td>
		      <td colspan="1" rowspan="1"><em>host:port</em> pair.</td>
		    
</tr>
		    
<tr>
		      
<td colspan="1" rowspan="1">dfs.name.dir</td>
		      <td colspan="1" rowspan="1">
		        Path on the local filesystem where the <span class="codefrag">NameNode</span> 
		        stores the namespace and transaction logs persistently.</td>
		      <td colspan="1" rowspan="1">
		        If this is a comma-delimited list of directories then the name 
		        table is replicated in all of the directories, for redundancy.
		      </td>
		    
</tr>
		    
<tr>
		      
<td colspan="1" rowspan="1">dfs.data.dir</td>
		      <td colspan="1" rowspan="1">
		        Comma separated list of paths on the local filesystem of a 
		        <span class="codefrag">DataNode</span> where it should store its blocks.
		      </td>
		      <td colspan="1" rowspan="1">
		        If this is a comma-delimited list of directories, then data will 
		        be stored in all named directories, typically on different 
		        devices.
		      </td>
		    
</tr>
		    
<tr>
		      
<td colspan="1" rowspan="1">mapred.system.dir</td>
		      <td colspan="1" rowspan="1">
		        Path on the HDFS where the Map-Reduce framework stores 
		        system files e.g. <span class="codefrag">/hadoop/mapred/system/</span>.
		      </td>
		      <td colspan="1" rowspan="1">
		        This is in the default filesystem (HDFS) and must be accessible 
		        from both the server and client machines.
		      </td>
		    
</tr>
		    
<tr>
		      
<td colspan="1" rowspan="1">mapred.local.dir</td>
		      <td colspan="1" rowspan="1">
		        Comma-separated list of paths on the local filesystem where 
		        temporary Map-Reduce data is written.
		      </td>
		      <td colspan="1" rowspan="1">Multiple paths help spread disk i/o.</td>
		    
</tr>
		    
<tr>
		      
<td colspan="1" rowspan="1">mapred.tasktracker.{map|reduce}.tasks.maximum</td>
		      <td colspan="1" rowspan="1">
		        The maximum number of map/reduce tasks, which are run 
		        simultaneously on a given <span class="codefrag">TaskTracker</span>, individually.
		      </td>
		      <td colspan="1" rowspan="1">
		        Defaults to 2 (2 maps and 2 reduces), but vary it depending on 
		        your hardware.
		      </td>
		    
</tr>
		    
<tr>
		      
<td colspan="1" rowspan="1">dfs.hosts/dfs.hosts.exclude</td>
		      <td colspan="1" rowspan="1">List of permitted/excluded DataNodes.</td>
		      <td colspan="1" rowspan="1">
		        If necessary, use these files to control the list of allowable 
		        datanodes.
		      </td>
		    
</tr>
		    
<tr>
		      
<td colspan="1" rowspan="1">mapred.hosts/mapred.hosts.exclude</td>
		      <td colspan="1" rowspan="1">List of permitted/excluded TaskTrackers.</td>
		      <td colspan="1" rowspan="1">
		        If necessary, use these files to control the list of allowable 
		        tasktrackers.
		      </td>
  		    
</tr>
		  
</table>
<p>Typically all the above parameters are marked as 
          <a href="api/org/apache/hadoop/conf/Configuration.html#FinalParams">
          final</a> to ensure that they cannot be overridden by user-applications.
          </p>
<a name="N101BD"></a><a name="Real-World+Cluster+Configurations"></a>
<h5>Real-World Cluster Configurations</h5>
<p>This section lists some non-default configuration parameters which 
            have been used to run the <em>sort</em> benchmark on very large 
            clusters.</p>
<ul>
              
<li>
                
<p>Some non-default configuration values used to run sort900,
                that is 9TB of data sorted on a cluster with 900 nodes:</p>
                
<table class="ForrestTable" cellspacing="1" cellpadding="4">
  		          
<tr>
		            
<th colspan="1" rowspan="1">Parameter</th>
		            <th colspan="1" rowspan="1">Value</th> 
		            <th colspan="1" rowspan="1">Notes</th>
		          
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">dfs.block.size</td>
                    <td colspan="1" rowspan="1">134217728</td>
                    <td colspan="1" rowspan="1">HDFS blocksize of 128MB for large file-systems.</td>
                  
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">dfs.namenode.handler.count</td>
                    <td colspan="1" rowspan="1">40</td>
                    <td colspan="1" rowspan="1">
                      More NameNode server threads to handle RPCs from large 
                      number of DataNodes.
                    </td>
                  
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">mapred.reduce.parallel.copies</td>
                    <td colspan="1" rowspan="1">20</td>
                    <td colspan="1" rowspan="1">
                      Higher number of parallel copies run by reduces to fetch
                      outputs from very large number of maps.
                    </td>
                  
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">mapred.child.java.opts</td>
                    <td colspan="1" rowspan="1">-Xmx512M</td>
                    <td colspan="1" rowspan="1">
                      Larger heap-size for child jvms of maps/reduces. Also controls the amount 
                      of virtual memory that a streaming/pipes task gets.
                    </td>
                  
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">fs.inmemory.size.mb</td>
                    <td colspan="1" rowspan="1">200</td>
                    <td colspan="1" rowspan="1">
                      Larger amount of memory allocated for the in-memory 
                      file-system used to merge map-outputs at the reduces.
                    </td>
                  
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">io.sort.factor</td>
                    <td colspan="1" rowspan="1">100</td>
                    <td colspan="1" rowspan="1">More streams merged at once while sorting files.</td>
                  
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">io.sort.mb</td>
                    <td colspan="1" rowspan="1">200</td>
                    <td colspan="1" rowspan="1">Higher memory-limit while sorting data.</td>
                  
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">io.file.buffer.size</td>
                    <td colspan="1" rowspan="1">131072</td>
                    <td colspan="1" rowspan="1">Size of read/write buffer used in SequenceFiles.</td>
                  
</tr>
                
</table>
              
</li>
              
<li>
                
<p>Updates to some configuration values to run sort1400 and 
                sort2000, that is 14TB of data sorted on 1400 nodes and 20TB of
                data sorted on 2000 nodes:</p>
                
<table class="ForrestTable" cellspacing="1" cellpadding="4">
  		          
<tr>
		            
<th colspan="1" rowspan="1">Parameter</th>
		            <th colspan="1" rowspan="1">Value</th> 
		            <th colspan="1" rowspan="1">Notes</th>
		          
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">mapred.job.tracker.handler.count</td>
                    <td colspan="1" rowspan="1">60</td>
                    <td colspan="1" rowspan="1">
                      More JobTracker server threads to handle RPCs from large 
                      number of TaskTrackers.
                    </td>
                  
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">mapred.reduce.parallel.copies</td>
                    <td colspan="1" rowspan="1">50</td>
                    <td colspan="1" rowspan="1"></td>
                  
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">tasktracker.http.threads</td>
                    <td colspan="1" rowspan="1">50</td>
                    <td colspan="1" rowspan="1">
                      More worker threads for the TaskTracker's http server. The
                      http server is used by reduces to fetch intermediate 
                      map-outputs.
                    </td>
                  
</tr>
                  
<tr>
                    
<td colspan="1" rowspan="1">mapred.child.java.opts</td>
                    <td colspan="1" rowspan="1">-Xmx1024M</td>
                    <td colspan="1" rowspan="1"></td>
                  
</tr>
                
</table>
              
</li>
            
</ul>
<a name="N102DA"></a><a name="Slaves"></a>
<h4>Slaves</h4>
<p>Typically you choose one machine in the cluster to act as the 
          <span class="codefrag">NameNode</span> and one machine to act as the 
          <span class="codefrag">JobTracker</span>, exclusively. The rest of the machines act as 
          both a <span class="codefrag">DataNode</span> and <span class="codefrag">TaskTracker</span> and are 
          referred to as <em>slaves</em>.</p>
<p>List all slave hostnames or IP addresses in your 
          <span class="codefrag">conf/slaves</span> file, one per line.</p>
<a name="N102F9"></a><a name="Logging"></a>
<h4>Logging</h4>
<p>Hadoop uses the <a href="http://logging.apache.org/log4j/">Apache 
          log4j</a> via the <a href="http://commons.apache.org/logging/">Apache 
          Commons Logging</a> framework for logging. Edit the 
          <span class="codefrag">conf/log4j.properties</span> file to customize the Hadoop 
          daemons' logging configuration (log-formats and so on).</p>
<a name="N1030D"></a><a name="History+Logging"></a>
<h5>History Logging</h5>
<p> The job history files are stored in a central location 
            <span class="codefrag"> hadoop.job.history.location </span> which can be on DFS also,
            whose default value is <span class="codefrag">${HADOOP_LOG_DIR}/history</span>. 
            Job history server is started on job tracker. The history 
            web UI is accessible from job tracker web UI.</p>
<p> The history files are also logged to user specified directory
            <span class="codefrag">hadoop.job.history.user.location</span> 
            which defaults to job output directory. The files are stored in
            "_logs/history/" in the specified directory. Hence, by default 
            they will be in "mapred.output.dir/_logs/history/". User can stop
            logging by giving the value <span class="codefrag">none</span> for 
            <span class="codefrag">hadoop.job.history.user.location</span>.
</p>
<p> User can view logs in specified directory using 
            the following command <br>
            
<span class="codefrag">$ bin/hadoop job -history output-dir</span>
<br>
            This will start a stand alone jetty on the client and 
            load history jsp's. 
            It will display the port where the server is up at. The server will
            be up for 30 minutes. User has to use 
            <span class="codefrag"> http://hostname:port </span> to view the history. User can 
            also provide http bind address using 
            <span class="codefrag">mapred.job.history.http.bindAddress</span>.
</p>
<p>Once all the necessary configuration is complete, distribute the files
      to the <span class="codefrag">HADOOP_CONF_DIR</span> directory on all the machines, 
      typically <span class="codefrag">${HADOOP_HOME}/conf</span>.</p>
</div>
    
    
<a name="N10343"></a><a name="Hadoop+Rack+Awareness"></a>
<h2 class="h3">Hadoop Rack Awareness</h2>
<div class="section">
<p>The HDFS and the Map-Reduce components are rack-aware.</p>
<p>The <span class="codefrag">NameNode</span> and the <span class="codefrag">JobTracker</span> obtain the
      <span class="codefrag">rack id</span> of the slaves in the cluster by invoking an API 
      <a href="api/org/apache/hadoop/net/DNSToSwitchMapping.html#resolve(java.util.List)">resolve</a> in an administrator configured
      module. The API resolves the slave's DNS name (also IP address) to a 
      rack id. What module to use can be configured using the configuration
      item <span class="codefrag">topology.node.switch.mapping.impl</span>. The default 
      implementation of the same runs a script/command configured using 
      <span class="codefrag">topology.script.file.name</span>. If topology.script.file.name is
      not set, the rack id <span class="codefrag">/default-rack</span> is returned for any 
      passed IP address. The additional configuration in the Map-Reduce
      part is <span class="codefrag">mapred.cache.task.levels</span> which determines the number
      of levels (in the network topology) of caches. So, for example, if it is
      the default value of 2, two levels of caches will be constructed - 
      one for hosts (host -&gt; task mapping) and another for racks 
      (rack -&gt; task mapping).
      </p>
</div>
    
    
<a name="N10369"></a><a name="Hadoop+Startup"></a>
<h2 class="h3">Hadoop Startup</h2>
<div class="section">
<p>To start a Hadoop cluster you will need to start both the HDFS and 
      Map-Reduce cluster.</p>
<p>
        Format a new distributed filesystem:<br>
        
<span class="codefrag">$ bin/hadoop namenode -format</span>
      
</p>
<p>
        Start the HDFS with the following command, run on the designated
        <span class="codefrag">NameNode</span>:<br>
        
<span class="codefrag">$ bin/start-dfs.sh</span>
      
</p>
<p>The <span class="codefrag">bin/start-dfs.sh</span> script also consults the 
      <span class="codefrag">${HADOOP_CONF_DIR}/slaves</span> file on the <span class="codefrag">NameNode</span> 
      and starts the <span class="codefrag">DataNode</span> daemon on all the listed slaves.</p>
<p>
        Start Map-Reduce with the following command, run on the designated
        <span class="codefrag">JobTracker</span>:<br>
        
<span class="codefrag">$ bin/start-mapred.sh</span>
      
</p>
<p>The <span class="codefrag">bin/start-mapred.sh</span> script also consults the 
      <span class="codefrag">${HADOOP_CONF_DIR}/slaves</span> file on the <span class="codefrag">JobTracker</span> 
      and starts the <span class="codefrag">TaskTracker</span> daemon on all the listed slaves.
      </p>
</div>
    
    
<a name="N103AF"></a><a name="Hadoop+Shutdown"></a>
<h2 class="h3">Hadoop Shutdown</h2>
<div class="section">
<p>
        Stop HDFS with the following command, run on the designated 
        <span class="codefrag">NameNode</span>:<br>
        
<span class="codefrag">$ bin/stop-dfs.sh</span>
      
</p>
<p>The <span class="codefrag">bin/stop-dfs.sh</span> script also consults the 
      <span class="codefrag">${HADOOP_CONF_DIR}/slaves</span> file on the <span class="codefrag">NameNode</span> 
      and stops the <span class="codefrag">DataNode</span> daemon on all the listed slaves.</p>
<p>
        Stop Map-Reduce with the following command, run on the designated
        <span class="codefrag">JobTracker</span>:<br>
        
<span class="codefrag">$ bin/stop-mapred.sh</span>
<br>
      
</p>
<p>The <span class="codefrag">bin/stop-mapred.sh</span> script also consults the 
      <span class="codefrag">${HADOOP_CONF_DIR}/slaves</span> file on the <span class="codefrag">JobTracker</span> 
      and stops the <span class="codefrag">TaskTracker</span> daemon on all the listed slaves.</p>
</div>
  
</div>
<!--+
    |end content
    +-->
<div class="clearboth">&nbsp;</div>
</div>
<div id="footer">
<!--+
    |start bottomstrip
    +-->
<div class="lastmodified">
<script type="text/javascript"><!--
document.write("Last Published: " + document.lastModified);
//  --></script>
</div>
<div class="copyright">
        Copyright &copy;
         2007 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
</div>
<!--+
    |end bottomstrip
    +-->
</div>
</body>
</html>