Browse Source

HADOOP-2984. Add forrest documentation for DistCp.


git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/branches/branch-0.18@667705 13f79535-47bb-0310-9956-ffa450edef68
Christopher Douglas 17 years ago
parent
commit
2443e7ffb0

+ 2 - 0
CHANGES.txt

@@ -285,6 +285,8 @@ Release 0.18.0 - Unreleased
     HADOOP-3096. Improves documentation about the Task Execution Environment in 
     HADOOP-3096. Improves documentation about the Task Execution Environment in 
     the Map-Reduce tutorial. (Amareshwari Sriramadasu via ddas)
     the Map-Reduce tutorial. (Amareshwari Sriramadasu via ddas)
 
 
+    HADOOP-2984. Add forrest documentation for DistCp. (cdouglas)
+
   OPTIMIZATIONS
   OPTIMIZATIONS
 
 
     HADOOP-3274. The default constructor of BytesWritable creates empty 
     HADOOP-3274. The default constructor of BytesWritable creates empty 

+ 3 - 0
docs/cluster_setup.html

@@ -129,6 +129,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 587 - 0
docs/distcp.html

@@ -0,0 +1,587 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.8">
+<meta name="Forrest-skin-name" content="pelt">
+<title>DistCp</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/core/">Core</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://hadoop.apache.org/core/"><img class="logoImage" alt="Hadoop" src="images/core-logo.gif" title="Scalable Computing Platform"></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
+<input value="hadoop.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li>
+<a class="unselected" href="http://hadoop.apache.org/core/">Project</a>
+</li>
+<li>
+<a class="unselected" href="http://wiki.apache.org/hadoop">Wiki</a>
+</li>
+<li class="current">
+<a class="selected" href="index.html">Hadoop 0.18 Documentation</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Documentation</div>
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
+<div class="menuitem">
+<a href="index.html">Overview</a>
+</div>
+<div class="menuitem">
+<a href="quickstart.html">Quickstart</a>
+</div>
+<div class="menuitem">
+<a href="cluster_setup.html">Cluster Setup</a>
+</div>
+<div class="menuitem">
+<a href="hdfs_design.html">HDFS Architecture</a>
+</div>
+<div class="menuitem">
+<a href="hdfs_user_guide.html">HDFS User Guide</a>
+</div>
+<div class="menuitem">
+<a href="hdfs_permissions_guide.html">HDFS Permissions Guide</a>
+</div>
+<div class="menuitem">
+<a href="hdfs_quota_admin_guide.html">HDFS Quotas Administrator Guide</a>
+</div>
+<div class="menuitem">
+<a href="hdfs_shell.html">FS Shell Guide</a>
+</div>
+<div class="menupage">
+<div class="menupagetitle">DistCp Guide</div>
+</div>
+<div class="menuitem">
+<a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
+</div>
+<div class="menuitem">
+<a href="native_libraries.html">Native Hadoop Libraries</a>
+</div>
+<div class="menuitem">
+<a href="streaming.html">Streaming</a>
+</div>
+<div class="menuitem">
+<a href="hadoop_archives.html">Hadoop Archives</a>
+</div>
+<div class="menuitem">
+<a href="hod.html">Hadoop On Demand</a>
+</div>
+<div class="menuitem">
+<a href="api/index.html">API Docs</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/hadoop/">Wiki</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="http://hadoop.apache.org/core/mailing_lists.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="releasenotes.html">Release Notes</a>
+</div>
+<div class="menuitem">
+<a href="changes.html">All Changes</a>
+</div>
+</div>
+<div id="credit"></div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="distcp.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>DistCp</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#Overview">Overview</a>
+</li>
+<li>
+<a href="#Usage">Usage</a>
+<ul class="minitoc">
+<li>
+<a href="#Basic">Basic</a>
+</li>
+<li>
+<a href="#options">Options</a>
+<ul class="minitoc">
+<li>
+<a href="#Option+Index">Option Index</a>
+</li>
+<li>
+<a href="#uo">Update and Overwrite</a>
+</li>
+</ul>
+</li>
+</ul>
+</li>
+<li>
+<a href="#etc">Appendix</a>
+<ul class="minitoc">
+<li>
+<a href="#Map+sizing">Map sizing</a>
+</li>
+<li>
+<a href="#cpver">Copying between versions of HDFS</a>
+</li>
+<li>
+<a href="#Map%2FReduce+and+other+side-effects">Map/Reduce and other side-effects</a>
+</li>
+</ul>
+</li>
+</ul>
+</div>
+
+    
+<a name="N1000D"></a><a name="Overview"></a>
+<h2 class="h3">Overview</h2>
+<div class="section">
+<p>DistCp (distributed copy) is a tool used for large inter/intra-cluster
+      copying. It uses map/reduce to effect its distribution, error
+      handling/recovery, and reporting. It expands a list of files and
+      directories into input to map tasks, each of which will copy a partition
+      of the files specified in the source list. Its map/reduce pedigree has
+      endowed it with some quirks in both its semantics and execution. The
+      purpose of this document is to offer guidance for common tasks and to
+      elucidate its model.</p>
+</div>
+
+    
+<a name="N10017"></a><a name="Usage"></a>
+<h2 class="h3">Usage</h2>
+<div class="section">
+<a name="N1001D"></a><a name="Basic"></a>
+<h3 class="h4">Basic</h3>
+<p>The most common invocation of DistCp is an inter-cluster copy:</p>
+<p>
+<span class="codefrag">bash$ hadoop distcp hdfs://nn1:8020/foo/bar \</span>
+<br>
+           
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 hdfs://nn2:8020/bar/foo</span>
+</p>
+<p>This will expand the namespace under <span class="codefrag">/foo/bar</span> on nn1
+        into a temporary file, partition its contents among a set of map
+        tasks, and start a copy on each TaskTracker from nn1 to nn2. Note
+        that DistCp expects absolute paths.</p>
+<p>One can also specify multiple source directories on the command
+        line:</p>
+<p>
+<span class="codefrag">bash$ hadoop distcp hdfs://nn1:8020/foo/a \</span>
+<br>
+           
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 hdfs://nn1:8020/foo/b \</span>
+<br>
+           
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 hdfs://nn2:8020/bar/foo</span>
+</p>
+<p>Or, equivalently, from a file using the <span class="codefrag">-f</span> option:<br>
+        
+<span class="codefrag">bash$ hadoop distcp -f hdfs://nn1:8020/srclist \</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              &nbsp;hdfs://nn2:8020/bar/foo</span>
+<br>
+</p>
+<p>Where <span class="codefrag">srclist</span> contains<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b</span>
+</p>
+<p>When copying from multiple sources, DistCp will abort the copy with
+        an error message if two sources collide, but collisions at the
+        destination are resolved per the <a href="#options">options</a>
+        specified. By default, files already existing at the destination are
+        skipped (i.e. not replaced by the source file). A count of skipped
+        files is reported at the end of each job, but it may be inaccurate if a
+        copier failed for some subset of its files, but succeeded on a later
+        attempt (see <a href="#etc">Appendix</a>).</p>
+<p>It is important that each TaskTracker can reach and communicate with
+        both the source and destination filesystems. For hdfs, both the source
+        and destination must be running the same version of the protocol or use
+        a backwards-compatible protocol (see <a href="#cpver">Copying Between
+        Versions</a>).</p>
+<p>After a copy, it is recommended that one generate and cross-check
+        a listing of the source and destination to verify that the copy was
+        truly successful. Since DistCp employs both map/reduce and the
+        FileSystem API, issues in or between any of the three could adversely
+        and silently affect the copy. Some have had success running with
+        <span class="codefrag">-update</span> enabled to perform a second pass, but users should
+        be acquainted with its semantics before attempting this.</p>
+<p>It's also worth noting that if another client is still writing to a
+        source file, the copy will likely fail. Attempting to overwrite a file
+        being written at the destination should also fail on HDFS. If a source
+        file is (re)moved before it is copied, the copy will fail with a
+        FileNotFoundException.</p>
+<a name="N1007E"></a><a name="options"></a>
+<h3 class="h4">Options</h3>
+<a name="N10084"></a><a name="Option+Index"></a>
+<h4>Option Index</h4>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+          
+<tr>
+<th colspan="1" rowspan="1"> Flag </th><th colspan="1" rowspan="1"> Description </th><th colspan="1" rowspan="1"> Notes </th>
+</tr>
+
+          
+<tr>
+<td colspan="1" rowspan="1"><span class="codefrag">-p[rbugp]</span></td>
+              <td colspan="1" rowspan="1">Preserve<br>
+                  &nbsp;&nbsp;r: replication number<br>
+                  &nbsp;&nbsp;b: block size<br>
+                  &nbsp;&nbsp;u: user<br>
+                  &nbsp;&nbsp;g: group<br>
+                  &nbsp;&nbsp;p: permission<br>
+</td>
+              <td colspan="1" rowspan="1">Modification times are not preserved. Also, when
+              <span class="codefrag">-update</span> is specified, status updates will
+              <strong>not</strong> be synchronized unless the file sizes
+              also differ (i.e. unless the file is re-created).
+              </td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1"><span class="codefrag">-i</span></td>
+              <td colspan="1" rowspan="1">Ignore failures</td>
+              <td colspan="1" rowspan="1">As explained in the <a href="#etc">Appendix</a>, this option
+              will keep more accurate statistics about the copy than the
+              default case. It also preserves logs from failed copies, which
+              can be valuable for debugging. Finally, a failing map will not
+              cause the job to fail before all splits are attempted.
+              </td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1"><span class="codefrag">-log &lt;logdir&gt;</span></td>
+              <td colspan="1" rowspan="1">Write logs to &lt;logdir&gt;</td>
+              <td colspan="1" rowspan="1">DistCp keeps logs of each file it attempts to copy as map
+              output. If a map fails, the log output will not be retained if
+              it is re-executed.
+              </td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1"><span class="codefrag">-m &lt;num_maps&gt;</span></td>
+              <td colspan="1" rowspan="1">Maximum number of simultaneous copies</td>
+              <td colspan="1" rowspan="1">Specify the number of maps to copy data. Note that more maps
+              may not necessarily improve throughput.
+              </td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1"><span class="codefrag">-overwrite</span></td>
+              <td colspan="1" rowspan="1">Overwrite destination</td>
+              <td colspan="1" rowspan="1">If a map fails and <span class="codefrag">-i</span> is not specified, all the
+              files in the split, not only those that failed, will be recopied.
+              As discussed in the <a href="#uo">following</a>, it also changes
+              the semantics for generating destination paths, so users should
+              use this carefully.
+              </td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1"><span class="codefrag">-update</span></td>
+              <td colspan="1" rowspan="1">Overwrite if src size different from dst size</td>
+              <td colspan="1" rowspan="1">As noted in the preceding, this is not a "sync"
+              operation. The only criterion examined is the source and
+              destination file sizes; if they differ, the source file
+              replaces the destination file. As discussed in the
+              <a href="#uo">following</a>, it also changes the semantics for
+              generating destination paths, so users should use this carefully.
+              </td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1"><span class="codefrag">-f &lt;urilist_uri&gt;</span></td>
+              <td colspan="1" rowspan="1">Use list at &lt;urilist_uri&gt; as src list</td>
+              <td colspan="1" rowspan="1">This is equivalent to listing each source on the command
+              line. The <span class="codefrag">urilist_uri</span> list should be a fully
+              qualified URI.
+              </td>
+</tr>
+
+        
+</table>
+<a name="N10136"></a><a name="uo"></a>
+<h4>Update and Overwrite</h4>
+<p>It's worth giving some examples of <span class="codefrag">-update</span> and
+        <span class="codefrag">-overwrite</span>. Consider a copy from <span class="codefrag">/foo/a</span> and
+        <span class="codefrag">/foo/b</span> to <span class="codefrag">/bar/foo</span>, where the sources contain
+        the following:</p>
+<p>
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/aa</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/ab</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/ba</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/ab</span>
+</p>
+<p>If either <span class="codefrag">-update</span> or <span class="codefrag">-overwrite</span> is set,
+        then both sources will map an entry to <span class="codefrag">/bar/foo/ab</span> at the
+        destination. For both options, the contents of each source directory
+        are compared with the <strong>contents</strong> of the destination
+        directory. Rather than permit this conflict, DistCp will abort.</p>
+<p>In the default case, both <span class="codefrag">/bar/foo/a</span> and
+        <span class="codefrag">/bar/foo/b</span> will be created and neither will collide.</p>
+<p>Now consider a legal copy using <span class="codefrag">-update</span>:<br>
+        
+<span class="codefrag">distcp -update hdfs://nn1:8020/foo/a \</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              hdfs://nn1:8020/foo/b \</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              hdfs://nn2:8020/bar</span>
+</p>
+<p>With sources/sizes:</p>
+<p>
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/aa 32</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/ab 32</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/ba 64</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/bb 32</span>
+</p>
+<p>And destination/sizes:</p>
+<p>
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/aa 32</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/ba 32</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/bb 64</span>
+</p>
+<p>Will effect:</p>
+<p>
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/aa 32</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/ab 32</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/ba 64</span>
+<br>
+        
+<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/bb 32</span>
+</p>
+<p>Only <span class="codefrag">aa</span> is not overwritten on nn2. If
+        <span class="codefrag">-overwrite</span> were specified, all elements would be
+        overwritten.</p>
+</div> <!-- Usage -->
+
+    
+<a name="N101E7"></a><a name="etc"></a>
+<h2 class="h3">Appendix</h2>
+<div class="section">
+<a name="N101ED"></a><a name="Map+sizing"></a>
+<h3 class="h4">Map sizing</h3>
+<p>DistCp makes a faint attempt to size each map comparably so that
+          each copies roughly the same number of bytes. Note that files are the
+          finest level of granularity, so increasing the number of simultaneous
+          copiers (i.e. maps) may not always increase the number of
+          simultaneous copies nor the overall throughput.</p>
+<p>If <span class="codefrag">-m</span> is not specified, DistCp will attempt to
+          schedule work for <span class="codefrag">min(total_bytes / bytes.per.map, 20 *
+          num_task_trackers)</span> where <span class="codefrag">bytes.per.map</span> defaults
+          to 256MB.</p>
+<p>Tuning the number of maps to the size of the source and
+          destination clusters, the size of the copy, and the available
+          bandwidth is recommended for long-running and regularly run jobs.</p>
+<a name="N10206"></a><a name="cpver"></a>
+<h3 class="h4">Copying between versions of HDFS</h3>
+<p>For copying between two different versions of Hadoop, one will
+        usually use HftpFileSystem. This is a read-only FileSystem, so DistCp
+        must be run on the destination cluster (more specifically, on
+        TaskTrackers that can write to the destination cluster). Each source is
+        specified as <span class="codefrag">hftp://&lt;dfs.http.address&gt;/&lt;path&gt;</span>
+        (the default <span class="codefrag">dfs.http.address</span> is
+        &lt;namenode&gt;:50070).</p>
+<a name="N10216"></a><a name="Map%2FReduce+and+other+side-effects"></a>
+<h3 class="h4">Map/Reduce and other side-effects</h3>
+<p>As has been mentioned in the preceding, should a map fail to copy
+        one of its inputs, there will be several side-effects.</p>
+<ul>
+
+          
+<li>Unless <span class="codefrag">-i</span> is specified, the logs generated by that
+          task attempt will be replaced by the previous attempt.</li>
+
+          
+<li>Unless <span class="codefrag">-overwrite</span> is specified, files successfully
+          copied by a previous map on a re-execution will be marked as
+          "skipped".</li>
+
+          
+<li>If a map fails <span class="codefrag">mapred.map.max.attempts</span> times, the
+          remaining map tasks will be killed (unless <span class="codefrag">-i</span> is
+          set).</li>
+
+          
+<li>If <span class="codefrag">mapred.speculative.execution</span> is set
+          <span class="codefrag">final</span> and <span class="codefrag">true</span>, the result of the copy is
+          undefined.</li>
+
+        
+</ul>
+</div> <!-- Appendix -->
+
+  
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2007 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

File diff suppressed because it is too large
+ 118 - 0
docs/distcp.pdf


+ 3 - 0
docs/hadoop_archives.html

@@ -129,6 +129,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/hdfs_design.html

@@ -131,6 +131,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/hdfs_permissions_guide.html

@@ -131,6 +131,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/hdfs_quota_admin_guide.html

@@ -131,6 +131,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/hdfs_shell.html

@@ -129,6 +129,9 @@ document.write("Last Published: " + document.lastModified);
 <div class="menupagetitle">FS Shell Guide</div>
 <div class="menupagetitle">FS Shell Guide</div>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/hdfs_user_guide.html

@@ -131,6 +131,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/hod.html

@@ -131,6 +131,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/hod_admin_guide.html

@@ -131,6 +131,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/hod_config_guide.html

@@ -131,6 +131,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/hod_user_guide.html

@@ -131,6 +131,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/index.html

@@ -129,6 +129,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 9 - 0
docs/linkmap.html

@@ -129,6 +129,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
@@ -246,6 +249,12 @@ document.write("Last Published: " + document.lastModified);
 </li>
 </li>
 </ul>
 </ul>
     
     
+<ul>
+<li>
+<a href="distcp.html">DistCp Guide</a>&nbsp;&nbsp;___________________&nbsp;&nbsp;<em>distcp</em>
+</li>
+</ul>
+    
 <ul>
 <ul>
 <li>
 <li>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>&nbsp;&nbsp;___________________&nbsp;&nbsp;<em>mapred</em>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>&nbsp;&nbsp;___________________&nbsp;&nbsp;<em>mapred</em>

+ 12 - 12
docs/linkmap.pdf

@@ -5,10 +5,10 @@
 /Producer (FOP 0.20.5) >>
 /Producer (FOP 0.20.5) >>
 endobj
 endobj
 5 0 obj
 5 0 obj
-<< /Length 1110 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1138 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
  >>
 stream
 stream
-Gatn&?#uJp'Sc)T/%A8*U=D2AkpFYuGP[HOhW`"&Vk\h`1r8+#E2NYa"\Oc=<5jKRbTr>gcMQgE?@BIh0E/I.&Z,OtlcfM)s0;femM74&jqgI#5K5,TKI>/n)uY,l;HfY^(H>16PI("`]&/D$33X1U9BUT6);CL#V.HLXoB<r&rO5X9&].T%P@JS@32BSm?)Y1B_>5NlDHi(E-N`&-P.%J#3Hh]S].9_$7U"pI7$Ti+g6$,mdq-++,WN$'!'<h>RQRj)`L-U9Z3^/<@2o@FF11O*?J#-\(`\4D^&0knj=^So(kFN_IRlQ@0EgN$"@W;X&,QDO48Gp<laGrf$*EK-/fLG%U:EU9Sc,].&FTi@2LbgBTD#SD/,;/oM@i)bpGM5'cm^`+5nY<Pm8S8dC=28@Cs6M8<I&Q#:=.CAN%TEGG?K*kQH`Oi/#g/6AOfGIp"ApN>8qPhP1-[6T0^3'\mCALb9e`5,nDVu%@X/Ee&5<>o$)7g;TaUjn2^M_h7Wa?WVcm($97([pi3gWWK4p0Oj-q7X`8O]9%9)mQXJf/*(LH`cqcjuD-(N3!&*rGWg$Y_Ks"c5q_[Gci5DNSC*"XWK?eONYps9e^/hAS.3OlOg;gGg,\Xt(@X%?Fk$@-9OmC;'=G_'A<afo4lKao+]92nG.ntXCr+L8O]u,-J8>#bIeb;Ni9")(B0a#@MHP"5L&!luJpe!JCR-M@HAY1(Jq]bDEFc$XEYj%`*EEIkfo5dIH&W%XsCPkL+#u[:fTWOfLI]CE]Q/()Wh9:I^"1+n3eC,K[a#N:ajiq-Bhf&pkqsj2q8FXR9:/03*P>B7m?ducX%FeIQ+8p0b2oS-tcm%)bgCnS^i#D2?9nR-+'H_uXX^)H19<76*^_dRI[\m6]L_GjuAO/P.QDG:tZQ=Rsoa/?ll#9>i'H:1U2WtjQfgp`bE&YSU6<&.V:i\qDWNVZeN&eQEG.i,;Y.);S%D`n"/>@bX"IMW;N&HUK`LHj1N6QG5F=g7,9UTrP7fD#1ZR@,M`E)3.hYh-;?(o=9mT7WZfY\9?c8E.O0$2;-aP?cphWRJW3$[$t=67='$okHDjOqhOh)]su%\e<<K5p2V!t`_kF8~>
+Gatn&?#uJp'Sc)T/%A8+U?W]-FIg["48he=:+s%;4"A<;'+>?VBDV'I:hlf5.NY*j-BV[qn"ZZLlJpE,na`,:Qi\+A[L:afb8)eC?(_Il?PK@81JT)94Gbu.khh<+ddfhKM;uOe\Tk0T]:IT8%lWEOl+F4/@s_3W>#r4_&cEM_hABf>+`*E,3>TYc6Yb^@KkgJrXFG<M[`-mH,d8:>C"p4\@je=$qbh1l8ZUX<W,bC"4&W1u\kI=-"@mVL?tUoZP=P>M(pcF8(;W0dN'fj[NqTm%lae/GK7i[n^]#2Tnm;IX(kFN_IRlQ?0EgN$"@W;X&,Qt_48>j;lZQmsKN<4Q(QD7#d9NBXcMgcOM"_Ds2LbgB*tGS:./>QdM@i)bpGM5Gcm^`+5nY<Pku;iXC=28@Cs6M8<Om=j:=.CAN%TEC[jo!jb@W<p[FF%!Z6Q75qWrKbXc5:Dalup,:EX"O?*?c8AWmk+Op;fK##/%3C#Z0/q0'=T.:keppmds\2m7^T.R+$%!m\;/IN!*BlB?0Ob%@R*ei:9i->H/s7kjHN:8<P+kU3#!VQ7+DJ=$(,<Q_Dk6J'PKI2[/lE8jfe1_,\e60GeR=;S7oT;u#P'G*lbm8I,4&iAO3almL>Ck*?6_L98U-tLf-"JKPVIIG,Rnrq;0bh:&rSLFXeb1T](qfRMe]&nt!aF'G!.0&22^,K9[_;RNEp5f0]l<>JO-/7,2QiU[74c!%Q&Ic6hXW&^J`GX#TbQu&!839;3@)FE#3pLDlLQ1:I5DR1Iis<N9VDndN+uNpQgsDXSI]19[Q/('QkKJNhQJ#7klKLm\jo?*9f(.ic%c;5)RlgFj36?DS@]uZn#Ru+EMn2inJU2\<d3"*^;`oj?/TSP;3>9FB^(pC+1alp$;nNdD[!"Sb/ZkU/JMA-&(Jh<C&Z6br/fO;u-Ws'h3pYc%>+ad.Qk!bS:U-$nAV3`tZ8cY'GF)VNU:NL?P+*/+lK`"G@h,.0[9;8qGOXQXM-pa$(=>Dg!mIB.7bbo,i`1JPa6#O0]XoONVTcT^U](>2=d0Q7@OngcpYQp4>NkTTrY74Gp+V0)F-ePMS,@:[1:P$$\,&BIcBIS*X7G58#b94Wnn_scDOnKuVr`QT_)V'/<AESW~>
 endstream
 endstream
 endobj
 endobj
 6 0 obj
 6 0 obj
@@ -72,17 +72,17 @@ endobj
 xref
 xref
 0 12
 0 12
 0000000000 65535 f 
 0000000000 65535 f 
-0000001934 00000 n 
-0000001992 00000 n 
-0000002042 00000 n 
+0000001962 00000 n 
+0000002020 00000 n 
+0000002070 00000 n 
 0000000015 00000 n 
 0000000015 00000 n 
 0000000071 00000 n 
 0000000071 00000 n 
-0000001273 00000 n 
-0000001379 00000 n 
-0000001491 00000 n 
-0000001600 00000 n 
-0000001710 00000 n 
-0000001818 00000 n 
+0000001301 00000 n 
+0000001407 00000 n 
+0000001519 00000 n 
+0000001628 00000 n 
+0000001738 00000 n 
+0000001846 00000 n 
 trailer
 trailer
 <<
 <<
 /Size 12
 /Size 12
@@ -90,5 +90,5 @@ trailer
 /Info 4 0 R
 /Info 4 0 R
 >>
 >>
 startxref
 startxref
-2162
+2190
 %%EOF
 %%EOF

+ 3 - 0
docs/mapred_tutorial.html

@@ -128,6 +128,9 @@ document.write("Last Published: " + document.lastModified);
 <div class="menuitem">
 <div class="menuitem">
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
+<div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
 <div class="menupage">
 <div class="menupage">
 <div class="menupagetitle">Map-Reduce Tutorial</div>
 <div class="menupagetitle">Map-Reduce Tutorial</div>
 </div>
 </div>

+ 3 - 0
docs/native_libraries.html

@@ -129,6 +129,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menupage">
 <div class="menupage">

+ 3 - 0
docs/quickstart.html

@@ -129,6 +129,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 3 - 0
docs/streaming.html

@@ -132,6 +132,9 @@ document.write("Last Published: " + document.lastModified);
 <a href="hdfs_shell.html">FS Shell Guide</a>
 <a href="hdfs_shell.html">FS Shell Guide</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">
+<a href="distcp.html">DistCp Guide</a>
+</div>
+<div class="menuitem">
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
 </div>
 </div>
 <div class="menuitem">
 <div class="menuitem">

+ 323 - 0
src/docs/src/documentation/content/xdocs/distcp.xml

@@ -0,0 +1,323 @@
+<?xml version="1.0"?>
+<!--
+  Copyright 2002-2004 The Apache Software Foundation
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+
+  <header>
+    <title>DistCp</title>
+  </header>
+
+  <body>
+
+    <section>
+      <title>Overview</title>
+
+      <p>DistCp (distributed copy) is a tool used for large inter/intra-cluster
+      copying. It uses map/reduce to effect its distribution, error
+      handling/recovery, and reporting. It expands a list of files and
+      directories into input to map tasks, each of which will copy a partition
+      of the files specified in the source list. Its map/reduce pedigree has
+      endowed it with some quirks in both its semantics and execution. The
+      purpose of this document is to offer guidance for common tasks and to
+      elucidate its model.</p>
+
+    </section>
+
+    <section>
+      <title>Usage</title>
+
+      <section>
+        <title>Basic</title>
+        <p>The most common invocation of DistCp is an inter-cluster copy:</p>
+        <p><code>bash$ hadoop distcp hdfs://nn1:8020/foo/bar \</code><br/>
+           <code>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 hdfs://nn2:8020/bar/foo</code></p>
+
+        <p>This will expand the namespace under <code>/foo/bar</code> on nn1
+        into a temporary file, partition its contents among a set of map
+        tasks, and start a copy on each TaskTracker from nn1 to nn2. Note
+        that DistCp expects absolute paths.</p>
+
+        <p>One can also specify multiple source directories on the command
+        line:</p>
+        <p><code>bash$ hadoop distcp hdfs://nn1:8020/foo/a \</code><br/>
+           <code>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 hdfs://nn1:8020/foo/b \</code><br/>
+           <code>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                 hdfs://nn2:8020/bar/foo</code></p>
+
+        <p>Or, equivalently, from a file using the <code>-f</code> option:<br/>
+        <code>bash$ hadoop distcp -f hdfs://nn1:8020/srclist \</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              &nbsp;hdfs://nn2:8020/bar/foo</code><br/></p>
+
+        <p>Where <code>srclist</code> contains<br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b</code></p>
+
+        <p>When copying from multiple sources, DistCp will abort the copy with
+        an error message if two sources collide, but collisions at the
+        destination are resolved per the <a href="#options">options</a>
+        specified. By default, files already existing at the destination are
+        skipped (i.e. not replaced by the source file). A count of skipped
+        files is reported at the end of each job, but it may be inaccurate if a
+        copier failed for some subset of its files, but succeeded on a later
+        attempt (see <a href="#etc">Appendix</a>).</p>
+
+        <p>It is important that each TaskTracker can reach and communicate with
+        both the source and destination filesystems. For hdfs, both the source
+        and destination must be running the same version of the protocol or use
+        a backwards-compatible protocol (see <a href="#cpver">Copying Between
+        Versions</a>).</p>
+
+        <p>After a copy, it is recommended that one generates and cross-checks
+        a listing of the source and destination to verify that the copy was
+        truly successful. Since DistCp employs both map/reduce and the
+        FileSystem API, issues in or between any of the three could adversely
+        and silently affect the copy. Some have had success running with
+        <code>-update</code> enabled to perform a second pass, but users should
+        be acquainted with its semantics before attempting this.</p>
+
+        <p>It's also worth noting that if another client is still writing to a
+        source file, the copy will likely fail. Attempting to overwrite a file
+        being written at the destination should also fail on HDFS. If a source
+        file is (re)moved before it is copied, the copy will fail with a
+        FileNotFoundException.</p>
+
+      </section> <!-- Basic -->
+
+      <section id="options">
+        <title>Options</title>
+
+        <section>
+        <title>Option Index</title>
+        <table>
+          <tr><th> Flag </th><th> Description </th><th> Notes </th></tr>
+
+          <tr><td><code>-p[rbugp]</code></td>
+              <td>Preserve<br/>
+                  &nbsp;&nbsp;r: replication number<br/>
+                  &nbsp;&nbsp;b: block size<br/>
+                  &nbsp;&nbsp;u: user<br/>
+                  &nbsp;&nbsp;g: group<br/>
+                  &nbsp;&nbsp;p: permission<br/></td>
+              <td>Modification times are not preserved. Also, when
+              <code>-update</code> is specified, status updates will
+              <strong>not</strong> be synchronized unless the file sizes
+              also differ (i.e. unless the file is re-created).
+              </td></tr>
+          <tr><td><code>-i</code></td>
+              <td>Ignore failures</td>
+              <td>As explained in the <a href="#etc">Appendix</a>, this option
+              will keep more accurate statistics about the copy than the
+              default case. It also preserves logs from failed copies, which
+              can be valuable for debugging. Finally, a failing map will not
+              cause the job to fail before all splits are attempted.
+              </td></tr>
+          <tr><td><code>-log &lt;logdir&gt;</code></td>
+              <td>Write logs to &lt;logdir&gt;</td>
+              <td>DistCp keeps logs of each file it attempts to copy as map
+              output. If a map fails, the log output will not be retained if
+              it is re-executed.
+              </td></tr>
+          <tr><td><code>-m &lt;num_maps&gt;</code></td>
+              <td>Maximum number of simultaneous copies</td>
+              <td>Specify the number of maps to copy data. Note that more maps
+              may not necessarily improve throughput.
+              </td></tr>
+          <tr><td><code>-overwrite</code></td>
+              <td>Overwrite destination</td>
+              <td>If a map fails and <code>-i</code> is not specified, all the
+              files in the split, not only those that failed, will be recopied.
+              As discussed in the <a href="#uo">following section</a>, it also changes
+              the semantics for generating destination paths, so users should
+              use this carefully.
+              </td></tr>
+          <tr><td><code>-update</code></td>
+              <td>Overwrite if src size different from dst size</td>
+              <td>As noted in the preceding, this is not a &quot;sync&quot;
+              operation. The only criterion examined is the source and
+              destination file sizes; if they differ, the source file
+              replaces the destination file. As discussed in the
+              <a href="#uo">following section</a>, it also changes the semantics for
+              generating destination paths, so users should use this carefully.
+              </td></tr>
+          <tr><td><code>-f &lt;urilist_uri&gt;</code></td>
+              <td>Use list at &lt;urilist_uri&gt; as src list</td>
+              <td>This is equivalent to listing each source on the command
+              line. The <code>urilist_uri</code> list should be a fully
+              qualified URI.
+              </td></tr>
+
+        </table>
+
+      </section>
+
+      <section id="uo">
+        <title>Update and Overwrite</title>
+
+        <p>It's worth giving some examples of <code>-update</code> and
+        <code>-overwrite</code>. Consider a copy from <code>/foo/a</code> and
+        <code>/foo/b</code> to <code>/bar/foo</code>, where the sources contain
+        the following:</p>
+
+        <p><code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/aa</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/ab</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/ba</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/ab</code></p>
+
+        <p>If either <code>-update</code> or <code>-overwrite</code> is set,
+        then both sources will map an entry to <code>/bar/foo/ab</code> at the
+        destination. For both options, the contents of each source directory
+        are compared with the <strong>contents</strong> of the destination
+        directory. Rather than permit this conflict, DistCp will abort.</p>
+
+        <p>In the default case, both <code>/bar/foo/a</code> and
+        <code>/bar/foo/b</code> will be created and neither will collide.</p>
+
+        <p>Now consider a legal copy using <code>-update</code>:<br/>
+        <code>distcp -update hdfs://nn1:8020/foo/a \</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              hdfs://nn1:8020/foo/b \</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+              hdfs://nn2:8020/bar</code></p>
+
+        <p>With sources/sizes:</p>
+
+        <p><code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/aa 32</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/ab 32</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/ba 64</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/bb 32</code></p>
+
+        <p>And destination/sizes:</p>
+
+        <p><code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/aa 32</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/ba 32</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/bb 64</code></p>
+
+        <p>Will effect:</p>
+
+        <p><code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/aa 32</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/ab 32</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/ba 64</code><br/>
+        <code>&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/bb 32</code></p>
+
+        <p>Only <code>aa</code> is not overwritten on nn2. If
+        <code>-overwrite</code> were specified, all elements would be
+        overwritten.</p>
+
+      </section> <!-- Update and Overwrite -->
+
+      </section> <!-- Options -->
+
+    </section> <!-- Usage -->
+
+    <section id="etc">
+      <title>Appendix</title>
+
+      <section>
+        <title>Map sizing</title>
+
+          <p>DistCp makes a faint attempt to size each map comparably so that
+          each copies roughly the same number of bytes. Note that files are the
+          finest level of granularity, so increasing the number of simultaneous
+          copiers (i.e. maps) may not always increase the number of
+          simultaneous copies nor the overall throughput.</p>
+
+          <p>If <code>-m</code> is not specified, DistCp will attempt to
+          schedule work for <code>min(total_bytes / bytes.per.map, 20 *
+          num_task_trackers)</code> where <code>bytes.per.map</code> defaults
+          to 256MB.</p>
+
+          <p>Tuning the number of maps to the size of the source and
+          destination clusters, the size of the copy, and the available
+          bandwidth is recommended for long-running and regularly run jobs.</p>
+
+      </section>
+
+      <section id="cpver">
+        <title>Copying between versions of HDFS</title>
+
+        <p>For copying between two different versions of Hadoop, one will
+        usually use HftpFileSystem. This is a read-only FileSystem, so DistCp
+        must be run on the destination cluster (more specifically, on
+        TaskTrackers that can write to the destination cluster). Each source is
+        specified as <code>hftp://&lt;dfs.http.address&gt;/&lt;path&gt;</code>
+        (the default <code>dfs.http.address</code> is
+        &lt;namenode&gt;:50070).</p>
+
+      </section>
+
+      <section>
+        <title>Map/Reduce and other side-effects</title>
+
+        <p>As has been mentioned in the preceding, should a map fail to copy
+        one of its inputs, there will be several side-effects.</p>
+
+        <ul>
+
+          <li>Unless <code>-i</code> is specified, the logs generated by that
+          task attempt will be replaced by those of the re-executed
+          attempt.</li>
+
+          <li>Unless <code>-overwrite</code> is specified, files successfully
+          copied by a previous map on a re-execution will be marked as
+          &quot;skipped&quot;.</li>
+
+          <li>If a map fails <code>mapred.map.max.attempts</code> times, the
+          remaining map tasks will be killed (unless <code>-i</code> is
+          set).</li>
+
+          <li>If <code>mapred.speculative.execution</code> is set
+          <code>final</code> and <code>true</code>, the result of the copy is
+          undefined.</li>
+
+        </ul>
+
+      </section>
+
+      <!--
+      <section>
+        <title>Firewalls and SSL</title>
+
+        <p>To copy over HTTP, use the HftpFileSystem as described in the
+        preceding <a href="#cpver">section</a>, and ensure that the required
+        port(s) are open.</p>
+
+        <p>TODO</p>
+
+      </section>
+      -->
+
+    </section> <!-- Appendix -->
+
+  </body>
+
+</document>

+ 1 - 0
src/docs/src/documentation/content/xdocs/site.xml

@@ -40,6 +40,7 @@ See http://forrest.apache.org/docs/linking.html for more info.
     <hdfs      label="HDFS Permissions Guide"    href="hdfs_permissions_guide.html" />
     <hdfs      label="HDFS Permissions Guide"    href="hdfs_permissions_guide.html" />
     <hdfs      label="HDFS Quotas Administrator Guide" href="hdfs_quota_admin_guide.html" />
     <hdfs      label="HDFS Quotas Administrator Guide" href="hdfs_quota_admin_guide.html" />
     <fs        label="FS Shell Guide"     href="hdfs_shell.html" />
     <fs        label="FS Shell Guide"     href="hdfs_shell.html" />
+    <distcp    label="DistCp Guide"       href="distcp.html" />
     <mapred    label="Map-Reduce Tutorial" href="mapred_tutorial.html" />
     <mapred    label="Map-Reduce Tutorial" href="mapred_tutorial.html" />
     <mapred    label="Native Hadoop Libraries" href="native_libraries.html" />
     <mapred    label="Native Hadoop Libraries" href="native_libraries.html" />
     <streaming label="Streaming"          href="streaming.html" />
     <streaming label="Streaming"          href="streaming.html" />

Some files were not shown because too many files changed in this diff