@@ -1,238 +1,318 @@
-<?xml version="1.0"?>
-
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
-          "http://forrest.apache.org/dtd/document-v20.dtd">
-
-
-<document>
-
-  <header>
-    <title>
-      Hadoop On Demand
-    </title>
-  </header>
-
-  <body>
-<section>
-<title>Overview</title>
-
-<p>The Hadoop On Demand (HOD) project is a system for provisioning and
-managing independent Hadoop MapReduce and HDFS instances on a shared cluster
-of nodes. HOD is a tool that makes it easy for administrators and users to
-quickly set up and use Hadoop. It is also a very useful tool for Hadoop developers
-and testers who need to share a physical cluster for testing their own Hadoop
-versions.
-</p>
-
-<p>HOD relies on a resource manager (RM) for allocation of nodes that it can use for
-running Hadoop instances. At present it runs with the <a href="ext:hod/torque">Torque
-resource manager</a>.
-</p>
-
-<p>
-The basic system architecture of HOD includes the following components:</p>
-<ul>
-  <li>A Resource manager (possibly together with a scheduler),</li>
-  <li>HOD components, and</li>
-  <li>Hadoop Map/Reduce and HDFS daemons.</li>
-</ul>
-
-<p>
-HOD provisions and maintains Hadoop Map/Reduce and, optionally, HDFS instances
-through interaction with the above components on a given cluster of nodes. A cluster of
-nodes can be thought of as comprising two sets of nodes:</p>
-<ul>
-  <li>Submit nodes: Users use the HOD client on these nodes to allocate clusters, and then
-use the Hadoop client to submit Hadoop jobs.</li>
-  <li>Compute nodes: Using the resource manager, HOD components are run on these nodes to
-provision the Hadoop daemons. After that, Hadoop jobs run on them.</li>
-</ul>
-
-<p>
-Here is a brief description of the sequence of operations in allocating a cluster and
-running jobs on it.
-</p>
-
-<ul>
-  <li>The user uses the HOD client on the Submit node to allocate a required number of
-cluster nodes and provision Hadoop on them.</li>
-  <li>The HOD client uses a Resource Manager interface (qsub, in Torque) to submit a HOD
-process, called the RingMaster, as a Resource Manager job, requesting the user-desired number
-of nodes. This job is submitted to the central server of the Resource Manager (pbs_server, in Torque).</li>
-  <li>On the compute nodes, the resource manager slave daemons (pbs_moms, in Torque) accept
-and run jobs that they are given by the central server (pbs_server, in Torque). The RingMaster
-process is started on one of the compute nodes (mother superior, in Torque).</li>
-  <li>The RingMaster then uses another Resource Manager interface (pbsdsh, in Torque) to run
-the second HOD component, HodRing, as distributed tasks on each of the compute
-nodes allocated.</li>
-  <li>The HodRings, after initializing, communicate with the RingMaster to get Hadoop commands,
-and run them accordingly. Once the Hadoop commands are started, they register with the RingMaster,
-giving information about the daemons.</li>
-  <li>All the configuration files needed for Hadoop instances are generated by HOD itself,
-some based on options given by the user in the HOD configuration file.</li>
-  <li>The HOD client keeps communicating with the RingMaster to find out the location of the
-JobTracker and HDFS daemons.</li>
-</ul>
-
-<p>The rest of the document deals with the steps needed to set up HOD on a physical cluster of nodes.</p>
-
-</section>
-
-<section>
-<title>Pre-requisites</title>
-
-<p>Operating System: HOD is currently tested on RHEL4.<br/>
-Nodes: HOD requires a minimum of 3 nodes configured through a resource manager.<br/></p>
-
-<p>Software</p>
-<p>The following components must be installed on *ALL* the nodes before using HOD:</p>
-<ul>
-  <li>Torque: Resource manager</li>
-  <li><a href="ext:hod/python">Python</a>: HOD requires version 2.5.1 of Python.</li>
-</ul>
-
-<p>The following components can optionally be installed for better
-functionality from HOD:</p>
-<ul>
-  <li><a href="ext:hod/twisted-python">Twisted Python</a>: This can be
-  used to improve the scalability of HOD. If this module is detected to be
-  installed, HOD uses it; otherwise it falls back to the default modules.</li>
-  <li><a href="ext:site">Hadoop</a>: HOD can automatically
-  distribute Hadoop to all nodes in the cluster. However, it can also use a
-  pre-installed version of Hadoop, if it is available on all nodes in the cluster.
-  HOD currently supports Hadoop 0.15 and above.</li>
-</ul>
-
-<p>NOTE: HOD configuration requires these components to be installed at the same
-location on all nodes in the cluster. Configuration is also simpler if they are
-at the same location on the submit nodes.
-</p>
-</section>
-
-<section>
-<title>Resource Manager</title>
-<p> Currently HOD works with the Torque resource manager, which it uses for its node
-  allocation and job submission. Torque is an open source resource manager from
-  <a href="ext:hod/cluster-resources">Cluster Resources</a>, a community effort
-  based on the PBS project. It provides control over batch jobs and distributed compute nodes. Torque is
-  freely available for download from <a href="ext:hod/torque-download">here</a>.
-  </p>
-
-<p> All documentation related to Torque can be found under
-  the TORQUE Resource Manager section <a
-  href="ext:hod/torque-docs">here</a>. You can
-  get wiki documentation from <a
-  href="ext:hod/torque-wiki">here</a>.
-  Users may wish to subscribe to TORQUE’s mailing list or view the archive for questions
-  and comments <a
-  href="ext:hod/torque-mailing-list">here</a>.
-</p>
-
-<p>To use HOD with Torque:</p>
-<ul>
-  <li>Install Torque components: pbs_server on one node (the head node), pbs_mom on all
-  compute nodes, and PBS client tools on all compute nodes and submit
-  nodes. Perform at least a basic configuration so that the Torque system is up and
-  running, i.e., pbs_server knows which machines to talk to. Look <a
-  href="ext:hod/torque-basic-config">here</a>
-  for basic configuration.
-
-  For advanced configuration, see <a
-  href="ext:hod/torque-advanced-config">here</a>.</li>
-  <li>Create a queue for submitting jobs on the pbs_server. The name of the queue is the
-  same as the HOD configuration parameter, resource_manager.queue. The HOD client uses this queue to
-  submit the RingMaster process as a Torque job.</li>
-  <li>Specify a 'cluster name' as a 'property' for all nodes in the cluster.
-  This can be done by using the 'qmgr' command. For example:
-  qmgr -c "set node node properties=cluster-name". The name of the cluster is the same as
-  the HOD configuration parameter, hod.cluster.</li>
-  <li>Ensure that jobs can be submitted to the nodes. This can be done by
-  using the 'qsub' command. For example:
-  echo "sleep 30" | qsub -l nodes=3</li>
-</ul>
-
-</section>
-
-<section>
-<title>Installing HOD</title>
-
-<p>Now that the resource manager setup is done, we proceed to obtaining and
-installing HOD.</p>
-<ul>
-  <li>If you are getting HOD from the Hadoop tarball, it is available under the
-  'contrib' section of Hadoop, under the root directory 'hod'.</li>
-  <li>If you are building from source, you can run 'ant tar' from the Hadoop root
-  directory to generate the Hadoop tarball, and then pick up HOD from there,
-  as described in the point above.</li>
-  <li>Distribute the files under this directory to all the nodes in the
-  cluster. Note that the location where the files are copied should be
-  the same on all the nodes.</li>
-  <li>Note that compiling Hadoop builds HOD with appropriate permissions
-  set on all the required HOD script files.</li>
-</ul>
-</section>
-
-<section>
-<title>Configuring HOD</title>
-
-<p>After HOD is installed, it has to be configured before we can start using
-it.</p>
-<section>
-  <title>Minimal Configuration to get started</title>
-<ul>
-  <li>On the node from which you want to run hod, edit the file hodrc
-  which can be found in the &lt;install dir&gt;/conf directory. This file
-  contains the minimal set of values required for running hod.</li>
-  <li>
-<p>Specify values suitable to your environment for the following
-  variables defined in the configuration file. Note that some of these
-  variables are defined at more than one place in the file.</p>
-
-  <ul>
-    <li>${JAVA_HOME}: Location of Java for Hadoop. Hadoop supports Sun JDK
-    1.5.x and above.</li>
-    <li>${CLUSTER_NAME}: Name of the cluster which is specified in the
-    'node property' as mentioned in resource manager configuration.</li>
-    <li>${HADOOP_HOME}: Location of Hadoop installation on the compute and
-    submit nodes.</li>
-    <li>${RM_QUEUE}: Queue configured for submitting jobs in the resource
-    manager configuration.</li>
-    <li>${RM_HOME}: Location of the resource manager installation on the
-    compute and submit nodes.</li>
-  </ul>
-</li>
-
-<li>
-<p>The following environment variables *may* need to be set depending on
-  your environment. These variables must be defined where you run the
-  HOD client, and also be specified in the HOD configuration file as the
-  value of the key resource_manager.env-vars. Multiple variables can be
-  specified as a comma separated list of key=value pairs.</p>
-
-  <ul>
-    <li>HOD_PYTHON_HOME: If you install Python to a non-default location
-    on the compute nodes or submit nodes, then this variable must be
-    defined to point to the Python executable in the non-standard
-    location.</li>
-  </ul>
-</li>
-</ul>
-</section>
-
-  <section>
-    <title>Advanced Configuration</title>
-    <p>You can review other configuration options in the file and modify them to suit
-  your needs. Refer to the <a href="hod_config_guide.html">Configuration Guide</a> for information about the HOD
-  configuration.
-    </p>
-  </section>
-</section>
-
-  <section>
-    <title>Running HOD</title>
-    <p>You can now proceed to the <a href="hod_user_guide.html">HOD User Guide</a> for information about how to run HOD,
-    its various features and options, and for help in troubleshooting.</p>
-  </section>
-</body>
-</document>
+<?xml version="1.0"?>
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
+          "http://forrest.apache.org/dtd/document-v20.dtd">
+
+
+<document>
+
+  <header>
+    <title>
+      Hadoop On Demand
+    </title>
+  </header>
+
+  <body>
+<section>
+<title>Overview</title>
+
+<p>The Hadoop On Demand (HOD) project is a system for provisioning and
+managing independent Hadoop MapReduce and HDFS instances on a shared cluster
+of nodes. HOD is a tool that makes it easy for administrators and users to
+quickly set up and use Hadoop. It is also a very useful tool for Hadoop developers
+and testers who need to share a physical cluster for testing their own Hadoop
+versions.
+</p>
+
+<p>HOD relies on a resource manager (RM) for allocation of nodes that it can use for
+running Hadoop instances. At present it runs with the <a href="ext:hod/torque">Torque
+resource manager</a>.
+</p>
+
+<p>
+The basic system architecture of HOD includes the following components:</p>
+<ul>
+  <li>A Resource manager (possibly together with a scheduler),</li>
+  <li>HOD components, and</li>
+  <li>Hadoop Map/Reduce and HDFS daemons.</li>
+</ul>
+
+<p>
+HOD provisions and maintains Hadoop Map/Reduce and, optionally, HDFS instances
+through interaction with the above components on a given cluster of nodes. A cluster of
+nodes can be thought of as comprising two sets of nodes:</p>
+<ul>
+  <li>Submit nodes: Users use the HOD client on these nodes to allocate clusters, and then
+use the Hadoop client to submit Hadoop jobs.</li>
+  <li>Compute nodes: Using the resource manager, HOD components are run on these nodes to
+provision the Hadoop daemons. After that, Hadoop jobs run on them.</li>
+</ul>
+
+<p>
+Here is a brief description of the sequence of operations in allocating a cluster and
+running jobs on it; a short illustrative sketch follows the list.
+</p>
+
+<ul>
+  <li>The user uses the HOD client on the Submit node to allocate a required number of
+cluster nodes and provision Hadoop on them.</li>
+  <li>The HOD client uses a Resource Manager interface (qsub, in Torque) to submit a HOD
+process, called the RingMaster, as a Resource Manager job, requesting the user-desired number
+of nodes. This job is submitted to the central server of the Resource Manager (pbs_server, in Torque).</li>
+  <li>On the compute nodes, the resource manager slave daemons (pbs_moms, in Torque) accept
+and run jobs that they are given by the central server (pbs_server, in Torque). The RingMaster
+process is started on one of the compute nodes (mother superior, in Torque).</li>
+  <li>The RingMaster then uses another Resource Manager interface (pbsdsh, in Torque) to run
+the second HOD component, HodRing, as distributed tasks on each of the compute
+nodes allocated.</li>
+  <li>The HodRings, after initializing, communicate with the RingMaster to get Hadoop commands,
+and run them accordingly. Once the Hadoop commands are started, they register with the RingMaster,
+giving information about the daemons.</li>
+  <li>All the configuration files needed for Hadoop instances are generated by HOD itself,
+some based on options given by the user in the HOD configuration file.</li>
+  <li>The HOD client keeps communicating with the RingMaster to find out the location of the
+JobTracker and HDFS daemons.</li>
+</ul>
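+
+<p>As an illustrative sketch of this flow from the user's point of view, the following
+commands allocate a small cluster, run a job on it, and release it. The cluster directory
+path, the node count and the example jar are assumptions made for illustration; see the
+<a href="hod_user_guide.html">HOD User Guide</a> for the exact commands and options.</p>
+<source>
+# Allocate a 3-node cluster; HOD generates a hadoop-site.xml in the cluster
+# directory (assumes the hod script is on your PATH).
+hod allocate -d ~/hod-clusters/test -n 3
+
+# Point the Hadoop client at the generated configuration and submit a job.
+hadoop --config ~/hod-clusters/test jar hadoop-examples.jar pi 10 100
+
+# Deallocate the nodes when done.
+hod deallocate -d ~/hod-clusters/test
+</source>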
+
+<p>The rest of the document deals with the steps needed to set up HOD on a physical cluster of nodes.</p>
+
+</section>
+
+<section>
+<title>Pre-requisites</title>
+
+<p>Operating System: HOD is currently tested on RHEL4.<br/>
+Nodes: HOD requires a minimum of 3 nodes configured through a resource manager.<br/></p>
+
+<p>Software</p>
+<p>The following components must be installed on *ALL* the nodes before using HOD:</p>
+<ul>
+  <li>Torque: Resource manager</li>
+  <li><a href="ext:hod/python">Python</a>: HOD requires version 2.5.1 of Python.</li>
+</ul>
+
+<p>The following components can optionally be installed for better
+functionality from HOD:</p>
+<ul>
+  <li><a href="ext:hod/twisted-python">Twisted Python</a>: This can be
+  used to improve the scalability of HOD. If this module is detected to be
+  installed, HOD uses it; otherwise it falls back to the default modules.</li>
+  <li><a href="ext:site">Hadoop</a>: HOD can automatically
+  distribute Hadoop to all nodes in the cluster. However, it can also use a
+  pre-installed version of Hadoop, if it is available on all nodes in the cluster.
+  HOD currently supports Hadoop 0.15 and above.</li>
+</ul>
+
+<p>NOTE: HOD configuration requires these components to be installed at the same
+location on all nodes in the cluster. Configuration is also simpler if they are
+at the same location on the submit nodes.
+</p>
+</section>
+
+<section>
+<title>Resource Manager</title>
+<p> Currently HOD works with the Torque resource manager, which it uses for its node
+  allocation and job submission. Torque is an open source resource manager from
+  <a href="ext:hod/cluster-resources">Cluster Resources</a>, a community effort
+  based on the PBS project. It provides control over batch jobs and distributed compute nodes. Torque is
+  freely available for download from <a href="ext:hod/torque-download">here</a>.
+  </p>
+
+<p> All documentation related to Torque can be found under
+  the TORQUE Resource Manager section <a
+  href="ext:hod/torque-docs">here</a>. You can
+  get wiki documentation from <a
+  href="ext:hod/torque-wiki">here</a>.
+  Users may wish to subscribe to TORQUE’s mailing list or view the archive for questions
+  and comments <a
+  href="ext:hod/torque-mailing-list">here</a>.
+</p>
+
+<p>To use HOD with Torque (a consolidated sketch of these steps follows the list):</p>
+<ul>
+  <li>Install Torque components: pbs_server on one node (the head node), pbs_mom on all
+  compute nodes, and PBS client tools on all compute nodes and submit
+  nodes. Perform at least a basic configuration so that the Torque system is up and
+  running, i.e., pbs_server knows which machines to talk to. Look <a
+  href="ext:hod/torque-basic-config">here</a>
+  for basic configuration.
+
+  For advanced configuration, see <a
+  href="ext:hod/torque-advanced-config">here</a>.</li>
+  <li>Create a queue for submitting jobs on the pbs_server. The name of the queue is the
+  same as the HOD configuration parameter, resource_manager.queue. The HOD client uses this queue to
+  submit the RingMaster process as a Torque job.</li>
+  <li>Specify a 'cluster name' as a 'property' for all nodes in the cluster.
+  This can be done by using the 'qmgr' command. For example:
+  qmgr -c "set node node properties=cluster-name". The name of the cluster is the same as
+  the HOD configuration parameter, hod.cluster.</li>
+  <li>Ensure that jobs can be submitted to the nodes. This can be done by
+  using the 'qsub' command. For example:
+  echo "sleep 30" | qsub -l nodes=3</li>
+</ul>
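+
+<p>The following is a minimal sketch that consolidates these steps into one shell
+session. The queue name 'hod', the node name 'compute-node-1' and the cluster name
+'test-cluster' are illustrative assumptions; substitute the values that match your HOD
+configuration (resource_manager.queue and hod.cluster).</p>
+<source>
+# Create and enable a queue for HOD jobs (the name must match resource_manager.queue).
+qmgr -c "create queue hod queue_type=execution"
+qmgr -c "set queue hod enabled=true"
+qmgr -c "set queue hod started=true"
+
+# Tag each compute node with the cluster name (must match hod.cluster).
+qmgr -c "set node compute-node-1 properties=test-cluster"
+
+# Verify that jobs can be submitted to the queue.
+echo "sleep 30" | qsub -q hod -l nodes=3
+</source>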
+
+</section>
+
+<section>
+<title>Installing HOD</title>
+
+<p>Now that the resource manager setup is done, we proceed to obtaining and
+installing HOD. A consolidated sketch follows the list below.</p>
+<ul>
+  <li>If you are getting HOD from the Hadoop tarball, it is available under the
+  'contrib' section of Hadoop, under the root directory 'hod'.</li>
+  <li>If you are building from source, you can run 'ant tar' from the Hadoop root
+  directory to generate the Hadoop tarball, and then pick up HOD from there,
+  as described in the point above.</li>
+  <li>Distribute the files under this directory to all the nodes in the
+  cluster. Note that the location where the files are copied should be
+  the same on all the nodes.</li>
+  <li>Note that compiling Hadoop builds HOD with appropriate permissions
+  set on all the required HOD script files.</li>
+</ul>
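+
+<p>As a sketch, and assuming a file ~/nodes listing all cluster hosts and an installation
+prefix of /opt/hod (both illustrative assumptions), the build-and-distribute steps could
+look like this:</p>
+<source>
+# Build the Hadoop tarball, which includes HOD under contrib/hod.
+cd /path/to/hadoop-source
+ant tar
+
+# Unpack the tarball locally to pick up the HOD directory.
+tar -xzf build/hadoop-*.tar.gz -C /tmp
+
+# Copy HOD to the same location on every node.
+for host in $(cat ~/nodes); do
+  scp -r /tmp/hadoop-*/contrib/hod $host:/opt/hod
+done
+</source>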
+</section>
+
+<section>
+<title>Configuring HOD</title>
+
+<p>After HOD is installed, it has to be configured before we can start using
+it.</p>
+<section>
+  <title>Minimal Configuration to get started</title>
+<ul>
+  <li>On the node from which you want to run hod, edit the file hodrc
+  which can be found in the &lt;install dir&gt;/conf directory. This file
+  contains the minimal set of values required for running hod.</li>
+  <li>
+<p>Specify values suitable to your environment for the following
+  variables defined in the configuration file. Note that some of these
+  variables are defined at more than one place in the file. A sample
+  sketch of the edited values follows this list.</p>
+
+  <ul>
+    <li>${JAVA_HOME}: Location of Java for Hadoop. Hadoop supports Sun JDK
+    1.5.x and above.</li>
+    <li>${CLUSTER_NAME}: Name of the cluster which is specified in the
+    'node property' as mentioned in resource manager configuration.</li>
+    <li>${HADOOP_HOME}: Location of Hadoop installation on the compute and
+    submit nodes.</li>
+    <li>${RM_QUEUE}: Queue configured for submitting jobs in the resource
+    manager configuration.</li>
+    <li>${RM_HOME}: Location of the resource manager installation on the
+    compute and submit nodes.</li>
+  </ul>
+</li>
+
+<li>
+<p>The following environment variables *may* need to be set depending on
+  your environment. These variables must be defined where you run the
+  HOD client, and also be specified in the HOD configuration file as the
+  value of the key resource_manager.env-vars. Multiple variables can be
+  specified as a comma separated list of key=value pairs.</p>
+
+  <ul>
+    <li>HOD_PYTHON_HOME: If you install Python to a non-default location
+    on the compute nodes or submit nodes, then this variable must be
+    defined to point to the Python executable in the non-standard
+    location.</li>
+  </ul>
+</li>
+</ul>
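+
+<p>As a minimal sketch, the edited values in hodrc might look like the following. The
+section and key names shown are those discussed above and in the
+<a href="hod_config_guide.html">Configuration Guide</a>; all values are illustrative
+assumptions for a hypothetical environment.</p>
+<source>
+[hod]
+java-home    = /usr/java/jdk1.5.0
+cluster      = test-cluster
+
+[resource_manager]
+id           = torque
+queue        = hod
+batch-home   = /usr/local/torque
+# Needed only if Python is in a non-default location on the nodes:
+env-vars     = HOD_PYTHON_HOME=/usr/local/python-2.5.1/bin/python
+</source>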
+</section>
+
+  <section>
+    <title>Advanced Configuration</title>
+    <p>You can review other configuration options in the file and modify them to suit
+  your needs. Refer to the <a href="hod_config_guide.html">Configuration Guide</a> for information about the HOD
+  configuration.
+    </p>
+  </section>
+</section>
+
+  <section>
+    <title>Running HOD</title>
+    <p>You can now proceed to the <a href="hod_user_guide.html">HOD User Guide</a> for information about how to run HOD,
+    its various features and options, and for help in troubleshooting.</p>
+  </section>
+
+  <section>
+    <title>Supporting Tools and Utilities</title>
+    <p>This section describes certain supporting tools and utilities that can be used to manage HOD deployments.</p>
+
+    <section>
+      <title>logcondense.py - Tool for removing log files uploaded to DFS</title>
+      <p>As mentioned in
+      <a href="hod_user_guide.html#Collecting+and+Viewing+Hadoop+Logs">this section</a> of the
+      <a href="hod_user_guide.html">HOD User Guide</a>, HOD can be configured to upload
+      Hadoop logs to a statically configured HDFS. Over time, the number of logs uploaded
+      to DFS could increase. logcondense.py is a tool that helps administrators clean up
+      log files older than a certain number of days.</p>
+      <section>
+        <title>Running logcondense.py</title>
+        <p>logcondense.py is available under the hod_install_location/support folder. You can either
+        run it using Python, e.g. <em>python logcondense.py</em>, or give the file execute permissions
+        and run it directly as <em>logcondense.py</em>. If permissions are enabled, logcondense.py needs to be
+        run by a user who has sufficient permissions to remove files from the locations where log
+        files are uploaded in the DFS. For example, as mentioned in the
+        <a href="hod_config_guide.html#3.7+hodring+options">configuration guide</a>, the logs could
+        be configured to come under the user's home directory in HDFS. In that case, the user
+        running logcondense.py should have superuser privileges to remove the files from under
+        all user home directories.</p>
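+        <p>For example, assuming HOD is installed under /opt/hod (an illustrative path),
+        either of the following invocations would work; the options themselves are
+        described in the next section:</p>
+        <source>
+# Run through the Python interpreter...
+python /opt/hod/support/logcondense.py -p /usr/bin/hadoop -d 7 -c /home/foo/hadoop/conf -l /user
+
+# ...or give the script execute permissions once and run it directly.
+chmod +x /opt/hod/support/logcondense.py
+/opt/hod/support/logcondense.py -p /usr/bin/hadoop -d 7 -c /home/foo/hadoop/conf -l /user
+</source>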
+      </section>
+      <section>
+        <title>Command Line Options for logcondense.py</title>
+        <p>The following command line options are supported for logcondense.py.</p>
+        <table>
+          <tr>
+            <td>Short Option</td>
+            <td>Long Option</td>
+            <td>Meaning</td>
+            <td>Example</td>
+          </tr>
+          <tr>
+            <td>-p</td>
+            <td>--package</td>
+            <td>Complete path to the hadoop script. The version of Hadoop must be the same as the
+            one running HDFS.</td>
+            <td>/usr/bin/hadoop</td>
+          </tr>
+          <tr>
+            <td>-d</td>
+            <td>--days</td>
+            <td>Delete log files older than the specified number of days.</td>
+            <td>7</td>
+          </tr>
+          <tr>
+            <td>-c</td>
+            <td>--config</td>
+            <td>Path to the Hadoop configuration directory, under which hadoop-site.xml resides.
+            The hadoop-site.xml must point to the HDFS NameNode from which logs are to be removed.</td>
+            <td>/home/foo/hadoop/conf</td>
+          </tr>
+          <tr>
+            <td>-l</td>
+            <td>--logs</td>
+            <td>An HDFS path; this must be the same HDFS path as specified for the log-destination-uri,
+            as mentioned in the <a href="hod_config_guide.html#3.7+hodring+options">configuration guide</a>,
+            without the hdfs:// URI string.</td>
+            <td>/user</td>
+          </tr>
+          <tr>
+            <td>-n</td>
+            <td>--dynamicdfs</td>
+            <td>If true, this indicates that logcondense.py should delete HDFS logs
+            in addition to Map/Reduce logs. Otherwise, it only deletes Map/Reduce logs, which is also the
+            default if this option is not specified. This option is useful if dynamic DFS installations
+            are being provisioned by HOD, and the static DFS installation is being used only to collect
+            logs - a scenario that may be common in test clusters.</td>
+            <td>false</td>
+          </tr>
+        </table>
+        <p>So, for example, to delete all log files older than 7 days using a hadoop-site.xml stored in
+        ~/hadoop-conf, and using the Hadoop installation under ~/hadoop-0.17.0, you could run:</p>
+        <p><em>python logcondense.py -p ~/hadoop-0.17.0/bin/hadoop -d 7 -c ~/hadoop-conf -l /user</em></p>
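+        <p>Since uploaded logs accumulate continuously, administrators may want to schedule
+        this clean-up to run periodically. As a sketch, a daily crontab entry for the above
+        command could look like the following; the schedule, the absolute paths and the
+        location of logcondense.py are illustrative assumptions:</p>
+        <source>
+# Remove logs older than 7 days, every day at 3 AM (cron does not expand '~').
+0 3 * * * python /home/foo/hod/support/logcondense.py -p /home/foo/hadoop-0.17.0/bin/hadoop -d 7 -c /home/foo/hadoop-conf -l /user
+</source>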
+      </section>
+    </section>
+  </section>
+</body>
+</document>