<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta content="Apache Forrest" name="Generator">
<meta name="Forrest-version" content="0.8">
<meta name="Forrest-skin-name" content="pelt">
<title>
HOD Administrator Guide
</title>
<link type="text/css" href="skin/basic.css" rel="stylesheet">
<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
<link type="text/css" href="skin/profile.css" rel="stylesheet">
<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
<link rel="shortcut icon" href="images/favicon.ico">
</head>
  18. <body onload="init()">
  19. <script type="text/javascript">ndeSetTextSize();</script>
  20. <div id="top">
  21. <!--+
  22. |breadtrail
  23. +-->
  24. <div class="breadtrail">
  25. <a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/core/">Core</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
  26. </div>
  27. <!--+
  28. |header
  29. +-->
  30. <div class="header">
  31. <!--+
  32. |start group logo
  33. +-->
  34. <div class="grouplogo">
  35. <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
  36. </div>
  37. <!--+
  38. |end group logo
  39. +-->
  40. <!--+
  41. |start Project Logo
  42. +-->
  43. <div class="projectlogo">
  44. <a href="http://hadoop.apache.org/core/"><img class="logoImage" alt="Hadoop" src="images/core-logo.gif" title="Scalable Computing Platform"></a>
  45. </div>
  46. <!--+
  47. |end Project Logo
  48. +-->
  49. <!--+
  50. |start Search
  51. +-->
  52. <div class="searchbox">
  53. <form action="http://www.google.com/search" method="get" class="roundtopsmall">
  54. <input value="hadoop.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp;
  55. <input name="Search" value="Search" type="submit">
  56. </form>
  57. </div>
  58. <!--+
  59. |end search
  60. +-->
  61. <!--+
  62. |start Tabs
  63. +-->
  64. <ul id="tabs">
  65. <li>
  66. <a class="unselected" href="http://hadoop.apache.org/core/">Project</a>
  67. </li>
  68. <li>
  69. <a class="unselected" href="http://wiki.apache.org/hadoop">Wiki</a>
  70. </li>
  71. <li class="current">
  72. <a class="selected" href="index.html">Hadoop 0.19 Documentation</a>
  73. </li>
  74. </ul>
  75. <!--+
  76. |end Tabs
  77. +-->
  78. </div>
  79. </div>
  80. <div id="main">
  81. <div id="publishedStrip">
  82. <!--+
  83. |start Subtabs
  84. +-->
  85. <div id="level2tabs"></div>
  86. <!--+
  87. |end Endtabs
  88. +-->
  89. <script type="text/javascript"><!--
  90. document.write("Last Published: " + document.lastModified);
  91. // --></script>
  92. </div>
  93. <!--+
  94. |breadtrail
  95. +-->
  96. <div class="breadtrail">
  97. &nbsp;
  98. </div>
  99. <!--+
  100. |start Menu, mainarea
  101. +-->
  102. <!--+
  103. |start Menu
  104. +-->
  105. <div id="menu">
  106. <div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Documentation</div>
  107. <div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
  108. <div class="menuitem">
  109. <a href="index.html">Overview</a>
  110. </div>
  111. <div class="menuitem">
  112. <a href="quickstart.html">Hadoop Quick Start</a>
  113. </div>
  114. <div class="menuitem">
  115. <a href="cluster_setup.html">Hadoop Cluster Setup</a>
  116. </div>
  117. <div class="menuitem">
  118. <a href="mapred_tutorial.html">Hadoop Map/Reduce Tutorial</a>
  119. </div>
  120. <div class="menuitem">
  121. <a href="commands_manual.html">Hadoop Command Guide</a>
  122. </div>
  123. <div class="menuitem">
  124. <a href="hdfs_shell.html">Hadoop FS Shell Guide</a>
  125. </div>
  126. <div class="menuitem">
  127. <a href="distcp.html">Hadoop DistCp Guide</a>
  128. </div>
  129. <div class="menuitem">
  130. <a href="native_libraries.html">Hadoop Native Libraries</a>
  131. </div>
  132. <div class="menuitem">
  133. <a href="streaming.html">Hadoop Streaming</a>
  134. </div>
  135. <div class="menuitem">
  136. <a href="hadoop_archives.html">Hadoop Archives</a>
  137. </div>
  138. <div class="menuitem">
  139. <a href="hdfs_user_guide.html">HDFS User Guide</a>
  140. </div>
  141. <div class="menuitem">
  142. <a href="hdfs_design.html">HDFS Architecture</a>
  143. </div>
  144. <div class="menuitem">
  145. <a href="hdfs_permissions_guide.html">HDFS Admin Guide: Permissions</a>
  146. </div>
  147. <div class="menuitem">
  148. <a href="hdfs_quota_admin_guide.html">HDFS Admin Guide: Quotas</a>
  149. </div>
  150. <div class="menuitem">
  151. <a href="SLG_user_guide.html">HDFS Utilities</a>
  152. </div>
  153. <div class="menuitem">
  154. <a href="hod_user_guide.html">HOD User Guide</a>
  155. </div>
  156. <div class="menupage">
  157. <div class="menupagetitle">HOD Admin Guide</div>
  158. </div>
  159. <div class="menuitem">
  160. <a href="hod_config_guide.html">HOD Config Guide</a>
  161. </div>
  162. <div class="menuitem">
  163. <a href="capacity_scheduler.html">Capacity Scheduler</a>
  164. </div>
  165. <div class="menuitem">
  166. <a href="api/index.html">API Docs</a>
  167. </div>
  168. <div class="menuitem">
  169. <a href="jdiff/changes.html">API Changes</a>
  170. </div>
  171. <div class="menuitem">
  172. <a href="http://wiki.apache.org/hadoop/">Wiki</a>
  173. </div>
  174. <div class="menuitem">
  175. <a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a>
  176. </div>
  177. <div class="menuitem">
  178. <a href="releasenotes.html">Release Notes</a>
  179. </div>
  180. <div class="menuitem">
  181. <a href="changes.html">Change Log</a>
  182. </div>
  183. </div>
  184. <div id="credit"></div>
  185. <div id="roundbottom">
  186. <img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
  187. <!--+
  188. |alternative credits
  189. +-->
  190. <div id="credit2"></div>
  191. </div>
  192. <!--+
  193. |end Menu
  194. +-->
  195. <!--+
  196. |start content
  197. +-->
  198. <div id="content">
  199. <div title="Portable Document Format" class="pdflink">
  200. <a class="dida" href="hod_admin_guide.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
  201. PDF</a>
  202. </div>
  203. <h1>
  204. HOD Administrator Guide
  205. </h1>
  206. <div id="minitoc-area">
  207. <ul class="minitoc">
  208. <li>
  209. <a href="#Overview">Overview</a>
  210. </li>
  211. <li>
  212. <a href="#Pre-requisites">Pre-requisites</a>
  213. </li>
  214. <li>
  215. <a href="#Resource+Manager">Resource Manager</a>
  216. </li>
  217. <li>
  218. <a href="#Installing+HOD">Installing HOD</a>
  219. </li>
  220. <li>
  221. <a href="#Configuring+HOD">Configuring HOD</a>
  222. <ul class="minitoc">
  223. <li>
  224. <a href="#Minimal+Configuration">Minimal Configuration</a>
  225. </li>
  226. <li>
  227. <a href="#Advanced+Configuration">Advanced Configuration</a>
  228. </li>
  229. </ul>
  230. </li>
  231. <li>
  232. <a href="#Running+HOD">Running HOD</a>
  233. </li>
  234. <li>
  235. <a href="#Supporting+Tools+and+Utilities">Supporting Tools and Utilities</a>
  236. <ul class="minitoc">
  237. <li>
  238. <a href="#logcondense.py+-+Manage+Log+Files">logcondense.py - Manage Log Files</a>
  239. <ul class="minitoc">
  240. <li>
  241. <a href="#Running+logcondense.py">Running logcondense.py</a>
  242. </li>
  243. <li>
  244. <a href="#Command+Line+Options+for+logcondense.py">Command Line Options for logcondense.py</a>
  245. </li>
  246. </ul>
  247. </li>
  248. <li>
  249. <a href="#checklimits.sh+-+Monitor+Resource+Limits">checklimits.sh - Monitor Resource Limits</a>
  250. <ul class="minitoc">
  251. <li>
  252. <a href="#Running+checklimits.sh">Running checklimits.sh</a>
  253. </li>
  254. </ul>
  255. </li>
  256. <li>
  257. <a href="#verify-account+-+Script+to+verify+an+account+under+which+%0A+++++++++++++jobs+are+submitted">verify-account - Script to verify an account under which
  258. jobs are submitted</a>
  259. <ul class="minitoc">
  260. <li>
  261. <a href="#Integrating+the+verify-account+script+with+HOD">Integrating the verify-account script with HOD</a>
  262. </li>
  263. </ul>
  264. </li>
  265. </ul>
  266. </li>
  267. </ul>
  268. </div>
  269. <a name="N1000C"></a><a name="Overview"></a>
  270. <h2 class="h3">Overview</h2>
  271. <div class="section">
  272. <p>Hadoop On Demand (HOD) is a system for provisioning and
  273. managing independent Hadoop Map/Reduce and Hadoop Distributed File System (HDFS)
  274. instances on a shared cluster
  275. of nodes. HOD is a tool that makes it easy for administrators and users to
  276. quickly setup and use Hadoop. HOD is also a very useful tool for Hadoop developers
  277. and testers who need to share a physical cluster for testing their own Hadoop
  278. versions.
  279. </p>
  280. <p>HOD relies on a resource manager (RM) for allocation of nodes that it can use for
  281. running Hadoop instances. At present it runs with the <a href="http://www.clusterresources.com/pages/products/torque-resource-manager.php">Torque
  282. resource manager</a>.
  283. </p>
  284. <p>
  285. The basic system architecture of HOD includes these components:</p>
  286. <ul>
  287. <li>A Resource manager (possibly together with a scheduler)</li>
  288. <li>Various HOD components</li>
  289. <li>Hadoop Map/Reduce and HDFS daemons</li>
  290. </ul>
  291. <p>
  292. HOD provisions and maintains Hadoop Map/Reduce and, optionally, HDFS instances
  293. through interaction with the above components on a given cluster of nodes. A cluster of
  294. nodes can be thought of as comprising two sets of nodes:</p>
  295. <ul>
  296. <li>Submit nodes: Users use the HOD client on these nodes to allocate clusters, and then
  297. use the Hadoop client to submit Hadoop jobs. </li>
  298. <li>Compute nodes: Using the resource manager, HOD components are run on these nodes to
  299. provision the Hadoop daemons. After that Hadoop jobs run on them.</li>
  300. </ul>
  301. <p>
  302. Here is a brief description of the sequence of operations in allocating a cluster and
  303. running jobs on them.
  304. </p>
  305. <ul>
  306. <li>The user uses the HOD client on the Submit node to allocate a desired number of
  307. cluster nodes and to provision Hadoop on them.</li>
  308. <li>The HOD client uses a resource manager interface (qsub, in Torque) to submit a HOD
  309. process, called the RingMaster, as a Resource Manager job, to request the user's desired number
  310. of nodes. This job is submitted to the central server of the resource manager (pbs_server, in Torque).</li>
  311. <li>On the compute nodes, the resource manager slave daemons (pbs_moms in Torque) accept
  312. and run jobs that they are assigned by the central server (pbs_server in Torque). The RingMaster
  313. process is started on one of the compute nodes (mother superior, in Torque).</li>
  314. <li>The RingMaster then uses another resource manager interface (pbsdsh, in Torque) to run
  315. the second HOD component, HodRing, as distributed tasks on each of the compute
  316. nodes allocated.</li>
  317. <li>The HodRings, after initializing, communicate with the RingMaster to get Hadoop commands,
  318. and run them accordingly. Once the Hadoop commands are started, they register with the RingMaster,
  319. giving information about the daemons.</li>
  320. <li>All the configuration files needed for Hadoop instances are generated by HOD itself,
  321. some obtained from options given by user in its own configuration file.</li>
  322. <li>The HOD client keeps communicating with the RingMaster to find out the location of the
  323. JobTracker and HDFS daemons.</li>
  324. </ul>
  325. <p>This guide shows you how to get started using HOD, reviews various HOD features and command line options, and provides detailed troubleshooting help.</p>
  326. </div>
  327. <a name="N10056"></a><a name="Pre-requisites"></a>
  328. <h2 class="h3">Pre-requisites</h2>
  329. <div class="section">
  330. <p>To use HOD, your system should include the following hardware and software
  331. components.</p>
  332. <p>Operating System: HOD is currently tested on RHEL4.<br>
  333. Nodes : HOD requires a minimum of three nodes configured through a resource manager.<br>
  334. </p>
  335. <p> Software </p>
  336. <p>The following components must be installed on ALL nodes before using HOD:</p>
  337. <ul>
  338. <li>
  339. <a href="http://www.clusterresources.com/pages/products/torque-resource-manager.php">Torque: Resource manager</a>
  340. </li>
  341. <li>
  342. <a href="http://www.python.org">Python</a> : HOD requires version 2.5.1 of Python.</li>
  343. </ul>
  344. <p>The following components are optional and can be installed to obtain better
  345. functionality from HOD:</p>
  346. <ul>
  347. <li>
  348. <a href="http://twistedmatrix.com/trac/">Twisted Python</a>: This can be
  349. used for improving the scalability of HOD. If this module is detected to be
  350. installed, HOD uses it, else it falls back to default modules.</li>
  351. <li>
  352. <a href="http://hadoop.apache.org/core/">Hadoop</a>: HOD can automatically
  353. distribute Hadoop to all nodes in the cluster. However, it can also use a
  354. pre-installed version of Hadoop, if it is available on all nodes in the cluster.
  355. HOD currently supports Hadoop 0.15 and above.</li>
  356. </ul>
  357. <p>NOTE: HOD configuration requires the location of installs of these
  358. components to be the same on all nodes in the cluster. It will also
  359. make the configuration simpler to have the same location on the submit
  360. nodes.
  361. </p>
  362. </div>
  363. <a name="N1008F"></a><a name="Resource+Manager"></a>
  364. <h2 class="h3">Resource Manager</h2>
  365. <div class="section">
  366. <p> Currently HOD works with the Torque resource manager, which it uses for its node
  367. allocation and job submission. Torque is an open source resource manager from
  368. <a href="http://www.clusterresources.com">Cluster Resources</a>, a community effort
  369. based on the PBS project. It provides control over batch jobs and distributed compute nodes. Torque is
  370. freely available for download from <a href="http://www.clusterresources.com/downloads/torque/">here</a>.
  371. </p>
  372. <p> All documentation related to torque can be seen under
  373. the section TORQUE Resource Manager <a href="http://www.clusterresources.com/pages/resources/documentation.php">here</a>. You can
  374. get wiki documentation from <a href="http://www.clusterresources.com/wiki/doku.php?id=torque:torque_wiki">here</a>.
  375. Users may wish to subscribe to TORQUE&rsquo;s mailing list or view the archive for questions,
  376. comments <a href="http://www.clusterresources.com/pages/resources/mailing-lists.php">here</a>.
  377. </p>
  378. <p>To use HOD with Torque:</p>
  379. <ul>
  380. <li>Install Torque components: pbs_server on one node (head node), pbs_mom on all
  381. compute nodes, and PBS client tools on all compute nodes and submit
  382. nodes. Perform at least a basic configuration so that the Torque system is up and
  383. running, that is, pbs_server knows which machines to talk to. Look <a href="http://www.clusterresources.com/wiki/doku.php?id=torque:1.2_basic_configuration">here</a>
  384. for basic configuration.
  385. For advanced configuration, see <a href="http://www.clusterresources.com/wiki/doku.php?id=torque:1.3_advanced_configuration">here</a>
  386. </li>
  387. <li>Create a queue for submitting jobs on the pbs_server. The name of the queue is the
  388. same as the HOD configuration parameter, resource-manager.queue. The HOD client uses this queue to
  389. submit the RingMaster process as a Torque job.</li>
  390. <li>Specify a cluster name as a property for all nodes in the cluster.
  391. This can be done by using the qmgr command. For example:
  392. <span class="codefrag">qmgr -c "set node node properties=cluster-name"</span>. The name of the cluster is the same as
  393. the HOD configuration parameter, hod.cluster. </li>
  394. <li>Make sure that jobs can be submitted to the nodes. This can be done by
  395. using the qsub command. For example:
  396. <span class="codefrag">echo "sleep 30" | qsub -l nodes=3</span>
  397. </li>
  398. </ul>
  399. </div>
  400. <a name="N100CE"></a><a name="Installing+HOD"></a>
  401. <h2 class="h3">Installing HOD</h2>
  402. <div class="section">
  403. <p>Once the resource manager is set up, you can obtain and
  404. install HOD.</p>
  405. <ul>
  406. <li>If you are getting HOD from the Hadoop tarball, it is available under the
  407. 'contrib' section of Hadoop, under the root directory 'hod'.</li>
  408. <li>If you are building from source, you can run ant tar from the Hadoop root
  409. directory to generate the Hadoop tarball, and then get HOD from there,
  410. as described above.</li>
  411. <li>Distribute the files under this directory to all the nodes in the
  412. cluster. Note that the location where the files are copied should be
  413. the same on all the nodes.</li>
  414. <li>Note that compiling hadoop would build HOD with appropriate permissions
  415. set on all the required script files in HOD.</li>
  416. </ul>
  417. </div>
  418. <a name="N100E7"></a><a name="Configuring+HOD"></a>
  419. <h2 class="h3">Configuring HOD</h2>
  420. <div class="section">
  421. <p>You can configure HOD once it is installed. The minimal configuration needed
  422. to run HOD is described below. More advanced configuration options are discussed
  423. in the HOD Configuration Guide.</p>
  424. <a name="N100F0"></a><a name="Minimal+Configuration"></a>
  425. <h3 class="h4">Minimal Configuration</h3>
  426. <p>To get started using HOD, the following minimal configuration is
  427. required:</p>
  428. <ul>
  429. <li>On the node from where you want to run HOD, edit the file hodrc
  430. located in the &lt;install dir&gt;/conf directory. This file
  431. contains the minimal set of values required to run hod.</li>
  432. <li>
  433. <p>Specify values suitable to your environment for the following
  434. variables defined in the configuration file. Note that some of these
  435. variables are defined at more than one place in the file.</p>
  436. <ul>
  437. <li>${JAVA_HOME}: Location of Java for Hadoop. Hadoop supports Sun JDK
  438. 1.6.x and above.</li>
  439. <li>${CLUSTER_NAME}: Name of the cluster which is specified in the
  440. 'node property' as mentioned in resource manager configuration.</li>
  441. <li>${HADOOP_HOME}: Location of Hadoop installation on the compute and
  442. submit nodes.</li>
  443. <li>${RM_QUEUE}: Queue configured for submitting jobs in the resource
  444. manager configuration.</li>
  445. <li>${RM_HOME}: Location of the resource manager installation on the
  446. compute and submit nodes.</li>
  447. </ul>
  448. </li>
  449. <li>
  450. <p>The following environment variables may need to be set depending on
  451. your environment. These variables must be defined where you run the
  452. HOD client and must also be specified in the HOD configuration file as the
  453. value of the key resource_manager.env-vars. Multiple variables can be
  454. specified as a comma separated list of key=value pairs.</p>
  455. <ul>
  456. <li>HOD_PYTHON_HOME: If you install python to a non-default location
  457. of the compute nodes, or submit nodes, then this variable must be
  458. defined to point to the python executable in the non-standard
  459. location.</li>
  460. </ul>
  461. </li>
  462. </ul>
  463. <a name="N10124"></a><a name="Advanced+Configuration"></a>
  464. <h3 class="h4">Advanced Configuration</h3>
  465. <p> You can review and modify other configuration options to suit
  466. your specific needs. Refer to the <a href="hod_config_guide.html">HOD Configuration
  467. Guide</a> for more information.</p>
  468. </div>
  469. <a name="N10133"></a><a name="Running+HOD"></a>
  470. <h2 class="h3">Running HOD</h2>
  471. <div class="section">
  472. <p>You can run HOD once it is configured. Refer to the<a href="hod_user_guide.html"> HOD User Guide</a> for more information.</p>
  473. </div>
  474. <a name="N10141"></a><a name="Supporting+Tools+and+Utilities"></a>
  475. <h2 class="h3">Supporting Tools and Utilities</h2>
  476. <div class="section">
  477. <p>This section describes supporting tools and utilities that can be used to
  478. manage HOD deployments.</p>
  479. <a name="N1014A"></a><a name="logcondense.py+-+Manage+Log+Files"></a>
  480. <h3 class="h4">logcondense.py - Manage Log Files</h3>
  481. <p>As mentioned in the
  482. <a href="hod_user_guide.html#Collecting+and+Viewing+Hadoop+Logs">HOD User Guide</a>,
  483. HOD can be configured to upload
  484. Hadoop logs to a statically configured HDFS. Over time, the number of logs uploaded
  485. to HDFS could increase. logcondense.py is a tool that helps
  486. administrators to remove log files uploaded to HDFS. </p>
  487. <a name="N10157"></a><a name="Running+logcondense.py"></a>
  488. <h4>Running logcondense.py</h4>
  489. <p>logcondense.py is available under hod_install_location/support folder. You can either
  490. run it using python, for example, <em>python logcondense.py</em>, or give execute permissions
  491. to the file, and directly run it as <em>logcondense.py</em>. logcondense.py needs to be
  492. run by a user who has sufficient permissions to remove files from locations where log
  493. files are uploaded in the HDFS, if permissions are enabled. For example as mentioned in the
  494. <a href="hod_config_guide.html#3.7+hodring+options">HOD Configuration Guide</a>, the logs could
  495. be configured to come under the user's home directory in HDFS. In that case, the user
  496. running logcondense.py should have super user privileges to remove the files from under
  497. all user home directories.</p>
  498. <a name="N1016B"></a><a name="Command+Line+Options+for+logcondense.py"></a>
  499. <h4>Command Line Options for logcondense.py</h4>
  500. <p>The following command line options are supported for logcondense.py.</p>
  501. <table class="ForrestTable" cellspacing="1" cellpadding="4">
  502. <tr>
  503. <td colspan="1" rowspan="1">Short Option</td>
  504. <td colspan="1" rowspan="1">Long option</td>
  505. <td colspan="1" rowspan="1">Meaning</td>
  506. <td colspan="1" rowspan="1">Example</td>
  507. </tr>
  508. <tr>
  509. <td colspan="1" rowspan="1">-p</td>
  510. <td colspan="1" rowspan="1">--package</td>
  511. <td colspan="1" rowspan="1">Complete path to the hadoop script. The version of hadoop must be the same as the
  512. one running HDFS.</td>
  513. <td colspan="1" rowspan="1">/usr/bin/hadoop</td>
  514. </tr>
  515. <tr>
  516. <td colspan="1" rowspan="1">-d</td>
  517. <td colspan="1" rowspan="1">--days</td>
  518. <td colspan="1" rowspan="1">Delete log files older than the specified number of days</td>
  519. <td colspan="1" rowspan="1">7</td>
  520. </tr>
  521. <tr>
  522. <td colspan="1" rowspan="1">-c</td>
  523. <td colspan="1" rowspan="1">--config</td>
  524. <td colspan="1" rowspan="1">Path to the Hadoop configuration directory, under which hadoop-site.xml resides.
  525. The hadoop-site.xml must point to the HDFS NameNode from where logs are to be removed.</td>
  526. <td colspan="1" rowspan="1">/home/foo/hadoop/conf</td>
  527. </tr>
  528. <tr>
  529. <td colspan="1" rowspan="1">-l</td>
  530. <td colspan="1" rowspan="1">--logs</td>
  531. <td colspan="1" rowspan="1">A HDFS path, this must be the same HDFS path as specified for the log-destination-uri,
  532. as mentioned in the <a href="hod_config_guide.html#3.7+hodring+options">HOD Configuration Guide</a>,
  533. without the hdfs:// URI string</td>
  534. <td colspan="1" rowspan="1">/user</td>
  535. </tr>
  536. <tr>
  537. <td colspan="1" rowspan="1">-n</td>
  538. <td colspan="1" rowspan="1">--dynamicdfs</td>
  539. <td colspan="1" rowspan="1">If true, this will indicate that the logcondense.py script should delete HDFS logs
  540. in addition to Map/Reduce logs. Otherwise, it only deletes Map/Reduce logs, which is also the
  541. default if this option is not specified. This option is useful if
  542. dynamic HDFS installations
  543. are being provisioned by HOD, and the static HDFS installation is being used only to collect
  544. logs - a scenario that may be common in test clusters.</td>
  545. <td colspan="1" rowspan="1">false</td>
  546. </tr>
  547. </table>
  548. <p>So, for example, to delete all log files older than 7 days using a hadoop-site.xml stored in
  549. ~/hadoop-conf, using the hadoop installation under ~/hadoop-0.17.0, you could say:</p>
  550. <p>
  551. <em>python logcondense.py -p ~/hadoop-0.17.0/bin/hadoop -d 7 -c ~/hadoop-conf -l /user</em>
  552. </p>
  553. <a name="N1020E"></a><a name="checklimits.sh+-+Monitor+Resource+Limits"></a>
  554. <h3 class="h4">checklimits.sh - Monitor Resource Limits</h3>
  555. <p>checklimits.sh is a HOD tool specific to the Torque/Maui environment
  556. (<a href="http://www.clusterresources.com/pages/products/maui-cluster-scheduler.php">Maui Cluster Scheduler</a> is an open source job
  557. scheduler for clusters and supercomputers, from clusterresources). The
  558. checklimits.sh script
  559. updates the torque comment field when newly submitted job(s) violate or
  560. exceed
  561. over user limits set up in Maui scheduler. It uses qstat, does one pass
  562. over the torque job-list to determine queued or unfinished jobs, runs Maui
  563. tool checkjob on each job to see if user limits are violated and then
  564. runs torque's qalter utility to update job attribute 'comment'. Currently
  565. it updates the comment as <em>User-limits exceeded. Requested:([0-9]*)
  566. Used:([0-9]*) MaxLimit:([0-9]*)</em> for those jobs that violate limits.
  567. This comment field is then used by HOD to behave accordingly depending on
  568. the type of violation.</p>
  569. <a name="N1021E"></a><a name="Running+checklimits.sh"></a>
  570. <h4>Running checklimits.sh</h4>
  571. <p>checklimits.sh is available under the hod_install_location/support
  572. folder. This shell script can be run directly as <em>sh
  573. checklimits.sh </em>or as <em>./checklimits.sh</em> after enabling
  574. execute permissions. Torque and Maui binaries should be available
  575. on the machine where the tool is run and should be in the path
  576. of the shell script process. To update the
  577. comment field of jobs from different users, this tool must be run with
  578. torque administrative privileges. This tool must be run repeatedly
  579. after specific intervals of time to frequently update jobs violating
  580. constraints, for example via cron. Please note that the resource manager
  581. and scheduler commands used in this script can be expensive and so
  582. it is better not to run this inside a tight loop without sleeping.</p>
  583. <a name="N1022F"></a><a name="verify-account+-+Script+to+verify+an+account+under+which+%0A+++++++++++++jobs+are+submitted"></a>
  584. <h3 class="h4">verify-account - Script to verify an account under which
  585. jobs are submitted</h3>
  586. <p>Production systems use accounting packages to charge users for using
  587. shared compute resources. HOD supports a parameter
  588. <em>resource_manager.pbs-account</em> to allow users to identify the
  589. account under which they would like to submit jobs. It may be necessary
  590. to verify that this account is a valid one configured in an accounting
  591. system. The <em>hod-install-dir/bin/verify-account</em> script
  592. provides a mechanism to plug-in a custom script that can do this
  593. verification.</p>
  594. <a name="N1023E"></a><a name="Integrating+the+verify-account+script+with+HOD"></a>
  595. <h4>Integrating the verify-account script with HOD</h4>
  596. <p>HOD runs the <em>verify-account</em> script passing in the
  597. <em>resource_manager.pbs-account</em> value as argument to the script,
  598. before allocating a cluster. Sites can write a script that verify this
  599. account against their accounting systems. Returning a non-zero exit
  600. code from this script will cause HOD to fail allocation. Also, in
  601. case of an error, HOD will print the output of script to the user.
  602. Any descriptive error message can be passed to the user from the
  603. script in this manner.</p>
  604. <p>The default script that comes with the HOD installation does not
  605. do any validation, and returns a zero exit code.</p>
  606. <p>If the verify-account script is not found, then HOD will treat
  607. that verification is disabled, and continue allocation as is.</p>
  608. </div>
  609. </div>
  610. <!--+
  611. |end content
  612. +-->
  613. <div class="clearboth">&nbsp;</div>
  614. </div>
  615. <div id="footer">
  616. <!--+
  617. |start bottomstrip
  618. +-->
  619. <div class="lastmodified">
  620. <script type="text/javascript"><!--
  621. document.write("Last Published: " + document.lastModified);
  622. // --></script>
  623. </div>
  624. <div class="copyright">
  625. Copyright &copy;
  626. 2008 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
  627. </div>
  628. <!--+
  629. |end bottomstrip
  630. +-->
  631. </div>
  632. </body>
  633. </html>