hdfs_user_guide.html 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743
  1. <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
  2. <html>
  3. <head>
  4. <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
  5. <meta content="Apache Forrest" name="Generator">
  6. <meta name="Forrest-version" content="0.8">
  7. <meta name="Forrest-skin-name" content="pelt">
  8. <title>
  9. Hadoop DFS User Guide
  10. </title>
  11. <link type="text/css" href="skin/basic.css" rel="stylesheet">
  12. <link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
  13. <link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
  14. <link type="text/css" href="skin/profile.css" rel="stylesheet">
  15. <script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
  16. <link rel="shortcut icon" href="images/favicon.ico">
  17. </head>
  18. <body onload="init()">
  19. <script type="text/javascript">ndeSetTextSize();</script>
  20. <div id="top">
  21. <!--+
  22. |breadtrail
  23. +-->
  24. <div class="breadtrail">
  25. <a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/core/">Core</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
  26. </div>
  27. <!--+
  28. |header
  29. +-->
  30. <div class="header">
  31. <!--+
  32. |start group logo
  33. +-->
  34. <div class="grouplogo">
  35. <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
  36. </div>
  37. <!--+
  38. |end group logo
  39. +-->
  40. <!--+
  41. |start Project Logo
  42. +-->
  43. <div class="projectlogo">
  44. <a href="http://hadoop.apache.org/core/"><img class="logoImage" alt="Hadoop" src="images/core-logo.gif" title="Scalable Computing Platform"></a>
  45. </div>
  46. <!--+
  47. |end Project Logo
  48. +-->
  49. <!--+
  50. |start Search
  51. +-->
  52. <div class="searchbox">
  53. <form action="http://www.google.com/search" method="get" class="roundtopsmall">
  54. <input value="hadoop.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp;
  55. <input name="Search" value="Search" type="submit">
  56. </form>
  57. </div>
  58. <!--+
  59. |end search
  60. +-->
  61. <!--+
  62. |start Tabs
  63. +-->
  64. <ul id="tabs">
  65. <li>
  66. <a class="unselected" href="http://hadoop.apache.org/core/">Project</a>
  67. </li>
  68. <li>
  69. <a class="unselected" href="http://wiki.apache.org/hadoop">Wiki</a>
  70. </li>
  71. <li class="current">
  72. <a class="selected" href="index.html">Hadoop 0.18 Documentation</a>
  73. </li>
  74. </ul>
  75. <!--+
  76. |end Tabs
  77. +-->
  78. </div>
  79. </div>
  80. <div id="main">
  81. <div id="publishedStrip">
  82. <!--+
  83. |start Subtabs
  84. +-->
  85. <div id="level2tabs"></div>
  86. <!--+
  87. |end Endtabs
  88. +-->
  89. <script type="text/javascript"><!--
  90. document.write("Last Published: " + document.lastModified);
  91. // --></script>
  92. </div>
  93. <!--+
  94. |breadtrail
  95. +-->
  96. <div class="breadtrail">
  97. &nbsp;
  98. </div>
  99. <!--+
  100. |start Menu, mainarea
  101. +-->
  102. <!--+
  103. |start Menu
  104. +-->
  105. <div id="menu">
  106. <div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Documentation</div>
  107. <div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
  108. <div class="menuitem">
  109. <a href="index.html">Overview</a>
  110. </div>
  111. <div class="menuitem">
  112. <a href="quickstart.html">Quickstart</a>
  113. </div>
  114. <div class="menuitem">
  115. <a href="cluster_setup.html">Cluster Setup</a>
  116. </div>
  117. <div class="menuitem">
  118. <a href="hdfs_design.html">HDFS Architecture</a>
  119. </div>
  120. <div class="menupage">
  121. <div class="menupagetitle">HDFS User Guide</div>
  122. </div>
  123. <div class="menuitem">
  124. <a href="hdfs_permissions_guide.html">HDFS Permissions Guide</a>
  125. </div>
  126. <div class="menuitem">
  127. <a href="hdfs_quota_admin_guide.html">HDFS Quotas Administrator Guide</a>
  128. </div>
  129. <div class="menuitem">
  130. <a href="hdfs_shell.html">FS Shell Guide</a>
  131. </div>
  132. <div class="menuitem">
  133. <a href="mapred_tutorial.html">Map-Reduce Tutorial</a>
  134. </div>
  135. <div class="menuitem">
  136. <a href="native_libraries.html">Native Hadoop Libraries</a>
  137. </div>
  138. <div class="menuitem">
  139. <a href="streaming.html">Streaming</a>
  140. </div>
  141. <div class="menuitem">
  142. <a href="hadoop_archives.html">Hadoop Archives</a>
  143. </div>
  144. <div class="menuitem">
  145. <a href="hod.html">Hadoop On Demand</a>
  146. </div>
  147. <div class="menuitem">
  148. <a href="api/index.html">API Docs</a>
  149. </div>
  150. <div class="menuitem">
  151. <a href="http://wiki.apache.org/hadoop/">Wiki</a>
  152. </div>
  153. <div class="menuitem">
  154. <a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a>
  155. </div>
  156. <div class="menuitem">
  157. <a href="http://hadoop.apache.org/core/mailing_lists.html">Mailing Lists</a>
  158. </div>
  159. <div class="menuitem">
  160. <a href="releasenotes.html">Release Notes</a>
  161. </div>
  162. <div class="menuitem">
  163. <a href="changes.html">All Changes</a>
  164. </div>
  165. </div>
  166. <div id="credit"></div>
  167. <div id="roundbottom">
  168. <img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
  169. <!--+
  170. |alternative credits
  171. +-->
  172. <div id="credit2"></div>
  173. </div>
  174. <!--+
  175. |end Menu
  176. +-->
  177. <!--+
  178. |start content
  179. +-->
  180. <div id="content">
  181. <div title="Portable Document Format" class="pdflink">
  182. <a class="dida" href="hdfs_user_guide.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
  183. PDF</a>
  184. </div>
  185. <h1>
  186. Hadoop DFS User Guide
  187. </h1>
  188. <div id="minitoc-area">
  189. <ul class="minitoc">
  190. <li>
  191. <a href="#Purpose">Purpose</a>
  192. </li>
  193. <li>
  194. <a href="#Overview"> Overview </a>
  195. </li>
  196. <li>
  197. <a href="#Pre-requisites"> Pre-requisites </a>
  198. </li>
  199. <li>
  200. <a href="#Web+Interface"> Web Interface </a>
  201. </li>
  202. <li>
  203. <a href="#Shell+Commands">Shell Commands</a>
  204. <ul class="minitoc">
  205. <li>
  206. <a href="#DFSAdmin+Command"> DFSAdmin Command </a>
  207. </li>
  208. </ul>
  209. </li>
  210. <li>
  211. <a href="#Secondary+Namenode"> Secondary Namenode </a>
  212. </li>
  213. <li>
  214. <a href="#Rebalancer"> Rebalancer </a>
  215. </li>
  216. <li>
  217. <a href="#Rack+Awareness"> Rack Awareness </a>
  218. </li>
  219. <li>
  220. <a href="#Safemode"> Safemode </a>
  221. </li>
  222. <li>
  223. <a href="#Fsck"> Fsck </a>
  224. </li>
  225. <li>
  226. <a href="#Upgrade+and+Rollback"> Upgrade and Rollback </a>
  227. </li>
  228. <li>
  229. <a href="#File+Permissions+and+Security"> File Permissions and Security </a>
  230. </li>
  231. <li>
  232. <a href="#Scalability"> Scalability </a>
  233. </li>
  234. <li>
  235. <a href="#Related+Documentation"> Related Documentation </a>
  236. </li>
  237. </ul>
  238. </div>
  239. <a name="N1000D"></a><a name="Purpose"></a>
  240. <h2 class="h3">Purpose</h2>
  241. <div class="section">
  242. <p>
  243. This document aims to be the starting point for users working with
  244. Hadoop Distributed File System (HDFS) either as a part of a
  245. <a href="http://hadoop.apache.org/">Hadoop</a>
  246. cluster or as a stand-alone general purpose distributed file system.
  247. While HDFS is designed to "just-work" in many environments, a working
  248. knowledge of HDFS helps greatly with configuration improvements and
  249. diagnostics on a specific cluster.
  250. </p>
  251. </div>
  252. <a name="N1001B"></a><a name="Overview"></a>
  253. <h2 class="h3"> Overview </h2>
  254. <div class="section">
  255. <p>
  256. HDFS is the primary distributed storage used by Hadoop applications. A
  257. HDFS cluster primarily consists of a <em>NameNode</em> that manages the
  258. filesystem metadata and Datanodes that store the actual data. The
  259. architecture of HDFS is described in detail
  260. <a href="hdfs_design.html">here</a>. This user guide primarily deals with
  261. interaction of users and administrators with HDFS clusters.
  262. The <a href="images/hdfsarchitecture.gif">diagram</a> from
  263. <a href="hdfs_design.html">HDFS architecture</a> depicts
  264. basic interactions among Namenode, Datanodes, and the clients. Essentially,
  265. clients contact Namenode for file metadata or file modifications and perform
  266. actual file I/O directly with the datanodes.
  267. </p>
  268. <p>
  269. The following are some of the salient features that could be of
  270. interest to many users. The terms in <em>italics</em>
  271. are described in later sections.
  272. </p>
  273. <ul>
  274. <li>
  275. Hadoop, including HDFS, is well suited for distributed storage
  276. and distributed processing using commodity hardware. It is fault
  277. tolerant, scalable, and extremely simple to expand.
  278. <a href="mapred_tutorial.html">Map-Reduce</a>,
  279. well known for its simplicity and applicability for large set of
  280. distributed applications, is an integral part of Hadoop.
  281. </li>
  282. <li>
  283. HDFS is highly configurable with a default configuration well
  284. suited for many installations. Most of the time, configuration
  285. needs to be tuned only for very large clusters.
  286. </li>
  287. <li>
  288. It is written in Java and is supported on all major platforms.
  289. </li>
  290. <li>
  291. Supports <em>shell like commands</em> to interact with HDFS directly.
  292. </li>
  293. <li>
  294. Namenode and Datanodes have built-in web servers that make it
  295. easy to check current status of the cluster.
  296. </li>
  297. <li>
  298. New features and improvements are regularly implemented in HDFS.
  299. The following is a subset of useful features in HDFS:
  300. <ul>
  301. <li>
  302. <em>File permissions and authentication.</em>
  303. </li>
  304. <li>
  305. <em>Rack awareness</em> : to take a node's physical location into
  306. account while scheduling tasks and allocating storage.
  307. </li>
  308. <li>
  309. <em>Safemode</em> : an administrative mode for maintenance.
  310. </li>
  311. <li>
  312. <em>fsck</em> : a utility to diagnose the health of the filesystem, to
  313. find missing files or blocks.
  314. </li>
  315. <li>
  316. <em>Rebalancer</em> : tool to balance the cluster when the data is
  317. unevenly distributed among datanodes.
  318. </li>
  319. <li>
  320. <em>Upgrade and Rollback</em> : after a software upgrade,
  321. it is possible to
  322. rollback to HDFS' state before the upgrade in case of unexpected
  323. problems.
  324. </li>
  325. <li>
  326. <em>Secondary Namenode</em> : helps keep the size of the file
  327. containing the log of HDFS modifications within a certain limit at
  328. the Namenode.
  329. </li>
  330. </ul>
  331. </li>
  332. </ul>
  333. </div>
  334. <a name="N10083"></a><a name="Pre-requisites"></a>
  335. <h2 class="h3"> Pre-requisites </h2>
  336. <div class="section">
  337. <p>
  338. The following documents describe installation and set up of a
  339. Hadoop cluster :
  340. </p>
  341. <ul>
  342. <li>
  343. <a href="quickstart.html">Hadoop Quickstart</a>
  344. for first-time users.
  345. </li>
  346. <li>
  347. <a href="cluster_setup.html">Hadoop Cluster Setup</a>
  348. for large, distributed clusters.
  349. </li>
  350. </ul>
  351. <p>
  352. The rest of this document assumes the user is able to set up and run a
  353. HDFS with at least one Datanode. For the purpose of this document,
  354. both Namenode and Datanode could be running on the same physical
  355. machine.
  356. </p>
  357. </div>
  358. <a name="N100A1"></a><a name="Web+Interface"></a>
  359. <h2 class="h3"> Web Interface </h2>
  360. <div class="section">
  361. <p>
  362. Namenode and Datanode each run an internal web server in order to
  363. display basic information about the current status of the cluster.
  364. With the default configuration, namenode front page is at
  365. <span class="codefrag">http://namenode:50070/</span> .
  366. It lists the datanodes in the cluster and basic stats of the
  367. cluster. The web interface can also be used to browse the file
  368. system (using "Browse the file system" link on the Namenode front
  369. page).
  370. </p>
  371. </div>
  372. <a name="N100AE"></a><a name="Shell+Commands"></a>
  373. <h2 class="h3">Shell Commands</h2>
  374. <div class="section">
  375. <p>
  376. Hadoop includes various "shell-like" commands that directly
  377. interact with HDFS and other file systems that Hadoop supports.
  378. The command
  379. <span class="codefrag">bin/hadoop fs -help</span>
  380. lists the commands supported by Hadoop
  381. shell. Further,
  382. <span class="codefrag">bin/hadoop fs -help command</span>
  383. displays more detailed help on a command. The commands support
  384. most of the normal filesystem operations like copying files,
  385. changing file permissions, etc. It also supports a few HDFS
  386. specific operations like changing replication of files.
  387. </p>
  388. <a name="N100BD"></a><a name="DFSAdmin+Command"></a>
  389. <h3 class="h4"> DFSAdmin Command </h3>
  390. <p>
  391. <span class="codefrag">'bin/hadoop dfsadmin'</span>
  392. command supports a few HDFS administration related operations.
  393. <span class="codefrag">bin/hadoop dfsadmin -help</span>
  394. lists all the commands currently supported. For example:
  395. </p>
  396. <ul>
  397. <li>
  398. <span class="codefrag">-report</span>
  399. : reports basic stats of HDFS. Some of this information is
  400. also available on the Namenode front page.
  401. </li>
  402. <li>
  403. <span class="codefrag">-safemode</span>
  404. : though usually not required, an administrator can manually enter
  405. or leave <em>safemode</em>.
  406. </li>
  407. <li>
  408. <span class="codefrag">-finalizeUpgrade</span>
  409. : removes previous backup of the cluster made during last upgrade.
  410. </li>
  411. </ul>
  412. </div>
  413. <a name="N100E6"></a><a name="Secondary+Namenode"></a>
  414. <h2 class="h3"> Secondary Namenode </h2>
  415. <div class="section">
  416. <p>
  417. Namenode stores modifications to the filesystem as a log
  418. appended to a native filesystem file (<span class="codefrag">edits</span>).
  419. When a Namenode starts up, it reads HDFS state from an image
  420. file (<span class="codefrag">fsimage</span>) and then applies <em>edits</em> from
  421. edits log file. It then writes new HDFS state to (<span class="codefrag">fsimage</span>)
  422. and starts normal
  423. operation with an empty edits file. Since namenode merges
  424. <span class="codefrag">fsimage</span> and <span class="codefrag">edits</span> files only during start up,
  425. edits file could get very large over time on a large cluster.
  426. Another side effect of larger edits file is that next
  427. restart of Namenode takes longer.
  428. </p>
  429. <p>
  430. The secondary namenode merges fsimage and edits log periodically
  431. and keeps edits log size within a limit. It is usually run on a
  432. different machine than the primary Namenode since its memory requirements
  433. are on the same order as the primary namenode. The secondary
  434. namenode is started by <span class="codefrag">bin/start-dfs.sh</span> on the nodes
  435. specified in <span class="codefrag">conf/masters</span> file.
  436. </p>
  437. </div>
  438. <a name="N1010B"></a><a name="Rebalancer"></a>
  439. <h2 class="h3"> Rebalancer </h2>
  440. <div class="section">
  441. <p>
  442. HDFS data might not always be placed uniformly across the
  443. datanodes. One common reason is the addition of new datanodes to an
  444. existing cluster. While placing new <em>blocks</em> (data for a file is
  445. stored as a series of blocks), Namenode considers various
  446. parameters before choosing the datanodes to receive these blocks.
  447. Some of the considerations are :
  448. </p>
  449. <ul>
  450. <li>
  451. Policy to keep one of the replicas of a block on the same node
  452. as the node that is writing the block.
  453. </li>
  454. <li>
  455. Need to spread different replicas of a block across the racks so
  456. that cluster can survive loss of whole rack.
  457. </li>
  458. <li>
  459. One of the replicas is usually placed on the same rack as the
  460. node writing to the file so that cross-rack network I/O is
  461. reduced.
  462. </li>
  463. <li>
  464. Spread HDFS data uniformly across the datanodes in the cluster.
  465. </li>
  466. </ul>
  467. <p>
  468. Due to multiple competing considerations, data might not be
  469. uniformly placed across the datanodes.
  470. HDFS provides a tool for administrators that analyzes block
  471. placement and rebalances data across the datanodes. A brief
  472. administrator's guide for rebalancer as a
  473. <a href="http://issues.apache.org/jira/secure/attachment/12368261/RebalanceDesign6.pdf">PDF</a>
  474. is attached to
  475. <a href="http://issues.apache.org/jira/browse/HADOOP-1652">HADOOP-1652</a>.
  476. </p>
  477. </div>
  478. <a name="N10132"></a><a name="Rack+Awareness"></a>
  479. <h2 class="h3"> Rack Awareness </h2>
  480. <div class="section">
  481. <p>
  482. Typically large Hadoop clusters are arranged in <em>racks</em> and
  483. network traffic between different nodes within the same rack is
  484. much more desirable than network traffic across the racks. In
  485. addition Namenode tries to place replicas of block on
  486. multiple racks for improved fault tolerance. Hadoop lets the
  487. cluster administrators decide which <em>rack</em> a node belongs to
  488. through configuration variable <span class="codefrag">dfs.network.script</span>. When this
  489. script is configured, each node runs the script to determine its
  490. <em>rackid</em>. A default installation assumes all the nodes belong to
  491. the same rack. This feature and configuration is further described
  492. in <a href="http://issues.apache.org/jira/secure/attachment/12345251/Rack_aware_HDFS_proposal.pdf">PDF</a>
  493. attached to
  494. <a href="http://issues.apache.org/jira/browse/HADOOP-692">HADOOP-692</a>.
  495. </p>
  496. </div>
  497. <a name="N10150"></a><a name="Safemode"></a>
  498. <h2 class="h3"> Safemode </h2>
  499. <div class="section">
  500. <p>
  501. During start up Namenode loads the filesystem state from
  502. <em>fsimage</em> and <em>edits</em> log file. It then waits for datanodes
  503. to report their blocks so that it does not prematurely start
  504. replicating the blocks though enough replicas already exist in the
  505. cluster. During this time Namenode stays in <em>safemode</em>. A
  506. <em>Safemode</em>
  507. for Namenode is essentially a read-only mode for the HDFS cluster,
  508. where it does not allow any modifications to filesystem or blocks.
  509. Normally Namenode gets out of safemode automatically at
  510. the beginning. If required, HDFS could be placed in safemode explicitly
  511. using <span class="codefrag">'bin/hadoop dfsadmin -safemode'</span> command. Namenode front
  512. page shows whether safemode is on or off. A more detailed
  513. description and configuration is maintained as JavaDoc for
  514. <a href="http://hadoop.apache.org/core/docs/current/api/org/apache/hadoop/dfs/NameNode.html#setSafeMode(org.apache.hadoop.dfs.FSConstants.SafeModeAction)"><span class="codefrag">setSafeMode()</span></a>.
  515. </p>
  516. </div>
  517. <a name="N1016E"></a><a name="Fsck"></a>
  518. <h2 class="h3"> Fsck </h2>
  519. <div class="section">
  520. <p>
  521. HDFS supports <span class="codefrag">fsck</span> command to check for various
  522. inconsistencies.
  523. It is designed for reporting problems with various
  524. files, e.g. missing blocks for a file or under-replicated
  525. blocks. Unlike a traditional fsck utility for native filesystems,
  526. this command does not correct the errors it detects. Normally Namenode
  527. automatically corrects most of the recoverable failures. By default
  528. fsck ignores open files but provides an option to select during reporting.
  529. HDFS' fsck is not a
  530. Hadoop shell command. It can be run as '<span class="codefrag">bin/hadoop fsck</span>'.
  531. Fsck can be run on the whole filesystem or on a subset of files.
  532. </p>
  533. </div>
  534. <a name="N1017E"></a><a name="Upgrade+and+Rollback"></a>
  535. <h2 class="h3"> Upgrade and Rollback </h2>
  536. <div class="section">
  537. <p>
  538. When Hadoop is upgraded on an existing cluster, as with any
  539. software upgrade, it is possible there are new bugs or
  540. incompatible changes that affect existing applications and were
  541. not discovered earlier. In any non-trivial HDFS installation, it
  542. is not an option to lose any data, let alone to restart HDFS from
  543. scratch. HDFS allows administrators to go back to earlier version
  544. of Hadoop and <em>roll back</em> the cluster to the state it was in
  545. before
  546. the upgrade. HDFS upgrade is described in more detail in
  547. <a href="http://wiki.apache.org/hadoop/Hadoop%20Upgrade">upgrade wiki</a>.
  548. HDFS can have one such backup at a time. Before upgrading,
  549. administrators need to remove existing backup using <span class="codefrag">bin/hadoop
  550. dfsadmin -finalizeUpgrade</span> command. The following
  551. briefly describes typical upgrade procedure :
  552. </p>
  553. <ul>
  554. <li>
  555. Before upgrading Hadoop software,
  556. <em>finalize</em> if there is an existing backup.
  557. <span class="codefrag">dfsadmin -upgradeProgress status</span>
  558. can tell if the cluster needs to be <em>finalized</em>.
  559. </li>
  560. <li>Stop the cluster and distribute new version of Hadoop.</li>
  561. <li>
  562. Run the new version with <span class="codefrag">-upgrade</span> option
  563. (<span class="codefrag">bin/start-dfs.sh -upgrade</span>).
  564. </li>
  565. <li>
  566. Most of the time, cluster works just fine. Once the new HDFS is
  567. considered working well (maybe after a few days of operation),
  568. finalize the upgrade. Note that until the cluster is finalized,
  569. deleting the files that existed before the upgrade does not free
  570. up real disk space on the datanodes.
  571. </li>
  572. <li>
  573. If there is a need to move back to the old version,
  574. <ul>
  575. <li> stop the cluster and distribute earlier version of Hadoop. </li>
  576. <li> start the cluster with rollback option.
  577. (<span class="codefrag">bin/start-dfs.sh -rollback</span>).
  578. </li>
  579. </ul>
  580. </li>
  581. </ul>
  582. </div>
  583. <a name="N101BF"></a><a name="File+Permissions+and+Security"></a>
  584. <h2 class="h3"> File Permissions and Security </h2>
  585. <div class="section">
  586. <p>
  587. The file permissions are designed to be similar to file permissions on
  588. other familiar platforms like Linux. Currently, security is limited
  589. to simple file permissions. The user that starts Namenode is
  590. treated as the <em>super user</em> for HDFS. Future versions of HDFS will
  591. support network authentication protocols like Kerberos for user
  592. authentication and encryption of data transfers. The details are discussed in the
  593. <a href="hdfs_permissions_guide.html"><em>Permissions User and Administrator Guide</em></a>.
  594. </p>
  595. </div>
  596. <a name="N101D1"></a><a name="Scalability"></a>
  597. <h2 class="h3"> Scalability </h2>
  598. <div class="section">
  599. <p>
  600. Hadoop currently runs on clusters with thousands of nodes.
  601. <a href="http://wiki.apache.org/hadoop/PoweredBy">PoweredBy Hadoop</a>
  602. lists some of the organizations that deploy Hadoop on large
  603. clusters. HDFS has one Namenode for each cluster. Currently
  604. the total memory available on Namenode is the primary scalability
  605. limitation. On very large clusters, increasing average size of
  606. files stored in HDFS helps with increasing cluster size without
  607. increasing memory requirements on Namenode.
  608. The default configuration may not suit very large clusters.
  609. <a href="http://wiki.apache.org/hadoop/FAQ">Hadoop FAQ</a> page lists
  610. suggested configuration improvements for large Hadoop clusters.
  611. </p>
  612. </div>
  613. <a name="N101E3"></a><a name="Related+Documentation"></a>
  614. <h2 class="h3"> Related Documentation </h2>
  615. <div class="section">
  616. <p>
  617. This user guide is intended to be a good starting point for
  618. working with HDFS. While it continues to improve,
  619. there is a large wealth of documentation about Hadoop and HDFS.
  620. The following lists starting points for further exploration :
  621. </p>
  622. <ul>
  623. <li>
  624. <a href="http://hadoop.apache.org/">Hadoop Home Page</a>
  625. : the start page for everything Hadoop.
  626. </li>
  627. <li>
  628. <a href="http://wiki.apache.org/hadoop/FrontPage">Hadoop Wiki</a>
  629. : Front page for Hadoop Wiki documentation. Unlike this
  630. guide which is part of Hadoop source tree, Hadoop Wiki is
  631. regularly edited by Hadoop Community.
  632. </li>
  633. <li>
  634. <a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a> from Hadoop Wiki.
  635. </li>
  636. <li>
  637. Hadoop <a href="http://hadoop.apache.org/core/docs/current/api/">
  638. JavaDoc API</a>.
  639. </li>
  640. <li>
  641. Hadoop User Mailing List :
  642. <a href="mailto:core-user@hadoop.apache.org">core-user[at]hadoop.apache.org</a>.
  643. </li>
  644. <li>
  645. Explore <span class="codefrag">conf/hadoop-default.xml</span>.
  646. It includes brief
  647. description of most of the configuration variables available.
  648. </li>
  649. </ul>
  650. </div>
  651. </div>
  652. <!--+
  653. |end content
  654. +-->
  655. <div class="clearboth">&nbsp;</div>
  656. </div>
  657. <div id="footer">
  658. <!--+
  659. |start bottomstrip
  660. +-->
  661. <div class="lastmodified">
  662. <script type="text/javascript"><!--
  663. document.write("Last Published: " + document.lastModified);
  664. // --></script>
  665. </div>
  666. <div class="copyright">
  667. Copyright &copy;
  668. 2007 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
  669. </div>
  670. <!--+
  671. |end bottomstrip
  672. +-->
  673. </div>
  674. </body>
  675. </html>