
HADOOP-2720. Jumbo bug fix patch to HOD. Final sync of Apache SVN with internal Yahoo SVN. Contributed by Hemanth Yamijala.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/trunk@615919 13f79535-47bb-0310-9956-ffa450edef68
Nigel Daley 17 years ago
parent
commit
5dae068144

+ 3 - 0
CHANGES.txt

@@ -591,6 +591,9 @@ Trunk (unreleased changes)
     HADOOP-2576. Namenode performance degradation over time triggered by
     large heartbeat interval. (Raghu Angadi)

+    HADOOP-2720. Jumbo bug fix patch to HOD.  Final sync of Apache SVN with
+    internal Yahoo SVN.  (Hemanth Yamijala via nigel)
+
 Release 0.15.3 - 2008-01-18

   BUG FIXES

+ 1 - 16
src/contrib/hod/bin/VERSION

@@ -1,16 +1 @@
-#Licensed to the Apache Software Foundation (ASF) under one
-#or more contributor license agreements.  See the NOTICE file
-#distributed with this work for additional information
-#regarding copyright ownership.  The ASF licenses this file
-#to you under the Apache License, Version 2.0 (the
-#"License"); you may not use this file except in compliance
-#with the License.  You may obtain a copy of the License at
-
-#     http://www.apache.org/licenses/LICENSE-2.0
-
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-DEVELOPMENT
+0.4.0

+ 155 - 126
src/contrib/hod/bin/hod

@@ -45,7 +45,9 @@ sys.path.append(libDirectory)
 from hodlib.Hod.hod import hodRunner
 from hodlib.Common.setup import *
 from hodlib.Common.descGenerator import *
-from hodlib.Common.util import local_fqdn, need_to_allocate, filter_warnings, get_exception_error_string
+from hodlib.Common.util import local_fqdn, need_to_allocate, filter_warnings,\
+    get_exception_error_string, hodInterrupt, \
+    HOD_INTERRUPTED_MESG, HOD_INTERRUPTED_CODE
 from hodlib.Common.tcp import tcpError, tcpSocket

 filter_warnings()
@@ -91,7 +93,8 @@ defList = { 'hod' : (
               False, True, False, True, 's'),

              ('min-nodes', 'pos_int', 
-              'Minimum number of nodes to allocate at startup.',
+              'Minimum number of nodes to allocate at startup. ' + \
+              'Used with hod.script option',
               True, None, False, True, 'm'),

              ('script', 'file', 'Hadoop script to execute.',
@@ -124,10 +127,25 @@ defList = { 'hod' : (
               False, None, True, True),

              ('client-params', 'keyval', 'Hadoop client xml key/value list',
-              False, None, False, True, 'C'), 
+              True, None, False, True, 'C'), 

              ('hadoop-ui-log-dir', 'directory', 'Directory to store Web UI Logs of Hadoop',
-              False, None, False, True)),
+              True, None, False, True),
+
+             ('temp-dir', 'directory', 'HOD temporary directories.',
+              False, None, True, False),
+
+             ('update-worker-info', 'bool', 'Specifies whether to update Worker Info after allocation',
+              False, False, False, True),
+
+             ('title', 'string', 'Title for the current HOD allocation.',
+               True, "HOD", False, True, 'N'),
+
+             ('walltime', 'pos_int', 'Walltime in seconds for the current HOD allocation',
+              True, None, False, True),
+
+             ('script-wait-time', 'pos_int', 'Specifies the time to wait before running the script. Used with the hod.script option.',
+              True, 10, False, True, 'W')),

             'resource_manager' : (
              ('id', 'string', 'Batch scheduler ID: torque|condor.',
@@ -137,7 +155,7 @@ defList = { 'hod' : (
               False, None, False, True),

              ('pbs-account', 'string', 'User Account jobs are submitted under.',
-              True, pwd.getpwuid(os.getuid())[0], False, False, 'A'),
+              True, None, False, False, 'A'),

              ('queue', 'string', 'Queue of the batch scheduler to query.',
               True, 'batch', False, True, 'Q'),
@@ -215,7 +233,7 @@ defList = { 'hod' : (
               False, None, False, False),

              ('server-params', 'keyval', 'Hadoop xml key/value list',
-              False, None, False, True, 'M'),
+              True, None, False, True, 'M'),

              ('envs', 'keyval', 'environment to run this package in',
               False, None, False, False),
@@ -344,140 +362,151 @@ def op_requires_pkgs(config):
     return config['hod'].has_key('script')

 if __name__ == '__main__':  
-  confDef = definition()
-  confDef.add_defs(defList, defOrder)
-  hodOptions = options(confDef, "./%s -c <CONFIG_FILE> [OPTIONS]" % myName,
-                       VERSION, withConfig=True, defaultConfig=DEFAULT_CONFIG)
-
-  # hodConfig is a dict like object, hodConfig[section][name]
   try:
-    hodConfig = config(hodOptions['config'], configDef=confDef, 
-                     originalDir=hodOptions['hod']['original-dir'],
-                     options=hodOptions) 
-  except IOError, e:
-    print >>sys.stderr,"error: %s not found. Specify the path to the HOD configuration file, or define the environment variable %s under which a file named hodrc can be found." % (hodOptions['config'], 'HOD_CONF_DIR')
-    sys.exit(1)
-
-  status = True
-  statusMsgs = []
-
-  (status,statusMsgs) = hodConfig.verify()
-  if not status:
-    print >>sys.stderr,"error: bin/hod failed to start."
-    for msg in statusMsgs:
-      print >>sys.stderr,"%s" % (msg)
-    sys.exit(1)
-
-  ## TODO : should move the dependency verification to hodConfig.verify
-  if hodConfig['hod'].has_key('script') \
-    and not hodConfig['hod'].has_key('min-nodes'):
-    printErrors(hodConfig.var_error('hod', 'min-nodes',
-        "hod.min-nodes must be specified when using hod.script option."))
-    sys.exit(1)
-
-  if hodConfig['hod'].has_key('min-nodes'):
-    if hodConfig['hod']['min-nodes'] < 3:
-      printErrors(hodConfig.var_error('hod', 'min-nodes',
-        "hod.min-nodes must be >= 3 nodes: %s." % 
-        hodConfig['hod']['min-nodes']))
+    confDef = definition()
+    confDef.add_defs(defList, defOrder)
+    hodOptions = options(confDef, "./%s -c <CONFIG_FILE> [OPTIONS]" % myName,
+                         VERSION, withConfig=True, defaultConfig=DEFAULT_CONFIG)
+  
+    # hodConfig is a dict like object, hodConfig[section][name]
+    try:
+      hodConfig = config(hodOptions['config'], configDef=confDef, 
+                       originalDir=hodOptions['hod']['original-dir'],
+                       options=hodOptions) 
+    except IOError, e:
+      print >>sys.stderr,"error: %s not found. Specify the path to the HOD configuration file, or define the environment variable %s under which a file named hodrc can be found." % (hodOptions['config'], 'HOD_CONF_DIR')
      sys.exit(1)
  
-  if hodConfig['hod'].has_key('operation') and \
-    hodConfig['hod'].has_key('script'):
-    print "Script execution and hod operations are mutually exclusive."
-    hodOptions.print_help(sys.stderr)
-    sys.exit(1)
+    status = True
+    statusMsgs = []
  
-  if 'operation' not in hodConfig['hod'] and 'script' not in hodConfig['hod']:
-    print "HOD requires at least a script or operation be specified."
-    hodOptions.print_help(sys.stderr)
-    sys.exit(1)    
+    (status,statusMsgs) = hodConfig.verify()
+    if not status:
+      print >>sys.stderr,"error: bin/hod failed to start."
+      for msg in statusMsgs:
+        print >>sys.stderr,"%s" % (msg)
+      sys.exit(1)
  
-  if hodConfig['gridservice-hdfs']['external']:
-    hdfsAddress = "%s:%s" % (hodConfig['gridservice-hdfs']['host'], 
-                             hodConfig['gridservice-hdfs']['fs_port'])
-
-    hdfsSocket = tcpSocket(hdfsAddress)
-      
-    try:
-      hdfsSocket.open()
-      hdfsSocket.close()
-    except tcpError:
-      printErrors(hodConfig.var_error('hod', 'gridservice-hdfs', 
-        "Failed to open a connection to external hdfs address: %s." % 
-        hdfsAddress))
+    ## TODO : should move the dependency verification to hodConfig.verify
+    if hodConfig['hod'].has_key('script') \
+      and not hodConfig['hod'].has_key('min-nodes'):
+      printErrors(hodConfig.var_error('hod', 'min-nodes',
+          "hod.min-nodes must be specified when using hod.script option."))
      sys.exit(1)
-  else:
-    hodConfig['gridservice-hdfs']['host'] = 'localhost'
-
-  if hodConfig['gridservice-mapred']['external']:
-    mapredAddress = "%s:%s" % (hodConfig['gridservice-mapred']['host'], 
-                               hodConfig['gridservice-mapred']['tracker_port'])
-
-    mapredSocket = tcpSocket(mapredAddress)
-      
-    try:
-      mapredSocket.open()
-      mapredSocket.close()
-    except tcpError:
-      printErrors(hodConfig.var_error('hod', 'gridservice-mapred', 
-        "Failed to open a connection to external mapred address: %s." % 
-        mapredAddress))
+  
+    if hodConfig['hod'].has_key('min-nodes'):
+      if hodConfig['hod']['min-nodes'] < 3:
+        printErrors(hodConfig.var_error('hod', 'min-nodes',
+          "hod.min-nodes must be >= 3 nodes: %s." % 
+          hodConfig['hod']['min-nodes']))
+        sys.exit(1)
+    
+    if hodConfig['hod'].has_key('operation') and \
+      hodConfig['hod'].has_key('script'):
+      print "Script execution and hod operations are mutually exclusive."
+      hodOptions.print_help(sys.stderr)
      sys.exit(1)
-  else:
-    hodConfig['gridservice-mapred']['host'] = 'localhost'
-
-  if not hodConfig['ringmaster'].has_key('hadoop-tar-ball') and \
-    not hodConfig['gridservice-hdfs'].has_key('pkgs') and \
-    op_requires_pkgs(hodConfig):
-    printErrors(hodConfig.var_error('gridservice-hdfs', 'pkgs', 
-      "gridservice-hdfs.pkgs must be defined if ringmaster.hadoop-tar-ball "
-      + "is not defined."))
-    sys.exit(1)
-
-  if not hodConfig['ringmaster'].has_key('hadoop-tar-ball') and \
-    not hodConfig['gridservice-mapred'].has_key('pkgs') and \
-    op_requires_pkgs(hodConfig):
-    printErrors(hodConfig.var_error('gridservice-mapred', 'pkgs', 
-      "gridservice-mapred.pkgs must be defined if ringmaster.hadoop-tar-ball "
-      + "is not defined."))
-    sys.exit(1)
-
-  if hodConfig['hodring'].has_key('log-destination-uri'):
-    if hodConfig['hodring']['log-destination-uri'].startswith('file://'):
-      pass
-    elif hodConfig['hodring']['log-destination-uri'].startswith('hdfs://'):
-      hostPort = hodConfig['hodring']['log-destination-uri'][7:].split("/")
-      hostPort = hostPort[0]
-      socket = tcpSocket(hostPort)
+    
+    if 'operation' not in hodConfig['hod'] and 'script' not in hodConfig['hod']:
+      print "HOD requires at least a script or operation be specified."
+      hodOptions.print_help(sys.stderr)
+      sys.exit(1)    
+    
+    if hodConfig['gridservice-hdfs']['external']:
+      hdfsAddress = "%s:%s" % (hodConfig['gridservice-hdfs']['host'], 
+                               hodConfig['gridservice-hdfs']['fs_port'])
+  
+      hdfsSocket = tcpSocket(hdfsAddress)
+        
      try:
-        socket.open()
-        socket.close()
-      except:
-        printErrors(hodConfig.var_error('hodring', 'log-destination-uri', 
-        "Unable to contact host/port specified in log destination uri: %s" % 
-        hodConfig['hodring']['log-destination-uri']))
+        hdfsSocket.open()
+        hdfsSocket.close()
+      except tcpError:
+        printErrors(hodConfig.var_error('hod', 'gridservice-hdfs', 
+          "Failed to open a connection to external hdfs address: %s." % 
+          hdfsAddress))
        sys.exit(1)
    else:
-      printErrors(hodConfig.var_error('hodring', 'log-destination-uri', 
-        "The log destiniation uri must be of type local:// or hdfs://."))
+      hodConfig['gridservice-hdfs']['host'] = 'localhost'
+  
+    if hodConfig['gridservice-mapred']['external']:
+      mapredAddress = "%s:%s" % (hodConfig['gridservice-mapred']['host'], 
+                                 hodConfig['gridservice-mapred']['tracker_port'])
+  
+      mapredSocket = tcpSocket(mapredAddress)
+        
+      try:
+        mapredSocket.open()
+        mapredSocket.close()
+      except tcpError:
+        printErrors(hodConfig.var_error('hod', 'gridservice-mapred', 
+          "Failed to open a connection to external mapred address: %s." % 
+          mapredAddress))
+        sys.exit(1)
+    else:
+      hodConfig['gridservice-mapred']['host'] = 'localhost'
+  
+    if not hodConfig['ringmaster'].has_key('hadoop-tar-ball') and \
+      not hodConfig['gridservice-hdfs'].has_key('pkgs') and \
+      op_requires_pkgs(hodConfig):
+      printErrors(hodConfig.var_error('gridservice-hdfs', 'pkgs', 
+        "gridservice-hdfs.pkgs must be defined if ringmaster.hadoop-tar-ball "
+        + "is not defined."))
+      sys.exit(1)
+  
+    if not hodConfig['ringmaster'].has_key('hadoop-tar-ball') and \
+      not hodConfig['gridservice-mapred'].has_key('pkgs') and \
+      op_requires_pkgs(hodConfig):
+      printErrors(hodConfig.var_error('gridservice-mapred', 'pkgs', 
+        "gridservice-mapred.pkgs must be defined if ringmaster.hadoop-tar-ball "
+        + "is not defined."))
      sys.exit(1)
-  ## TODO : end of should move the dependency verification to hodConfig.verif
-    
-  hodConfig['hod']['base-dir'] = rootDirectory
-  hodConfig['hod']['user_state'] = DEFAULT_HOD_DIR
-
-  dGen = DescGenerator(hodConfig)
-  hodConfig = dGen.initializeDesc()
  
-  os.environ['JAVA_HOME'] = hodConfig['hod']['java-home']
+    if hodConfig['hodring'].has_key('log-destination-uri'):
+      if hodConfig['hodring']['log-destination-uri'].startswith('file://'):
+        pass
+      elif hodConfig['hodring']['log-destination-uri'].startswith('hdfs://'):
+        hostPort = hodConfig['hodring']['log-destination-uri'][7:].split("/")
+        hostPort = hostPort[0]
+        socket = tcpSocket(hostPort)
+        try:
+          socket.open()
+          socket.close()
+        except:
+          printErrors(hodConfig.var_error('hodring', 'log-destination-uri', 
+          "Unable to contact host/port specified in log destination uri: %s" % 
+          hodConfig['hodring']['log-destination-uri']))
+          sys.exit(1)
+      else:
+        printErrors(hodConfig.var_error('hodring', 'log-destination-uri', 
+          "The log destiniation uri must be of type local:// or hdfs://."))
+        sys.exit(1)
+  
+    ## TODO : end of should move the dependency verification to hodConfig.verif
+      
+    hodConfig['hod']['base-dir'] = rootDirectory
+    hodConfig['hod']['user_state'] = DEFAULT_HOD_DIR
+  
+    dGen = DescGenerator(hodConfig)
+    hodConfig = dGen.initializeDesc()
+    
+    os.environ['JAVA_HOME'] = hodConfig['hod']['java-home']
+    
+    if hodConfig['hod']['debug'] == 4:
+      print ""
+      print "Using Python: %s" % sys.version
+      print ""
+   
+    hod = hodRunner(hodConfig)
  
-  if hodConfig['hod']['debug'] == 4:
-    print ""
-    print "Using Python: %s" % sys.version
-    print ""
+    # Initiate signal handling
+    hodInterrupt.set_log(hod.get_logger())
+    hodInterrupt.init_signals()
+    # Interrupts set up. Now on we handle signals only when we wish to.
+  except KeyboardInterrupt:
+    print HOD_INTERRUPTED_MESG
+    sys.exit(HOD_INTERRUPTED_CODE)
  
-  hod = hodRunner(hodConfig)
   if hodConfig['hod'].has_key('script'):
     sys.exit(hod.script())
   else:  

+ 97 - 1
src/contrib/hod/bin/hodring

@@ -188,6 +188,102 @@ if __name__ == '__main__':
     service = HodRing(hodRingOptions)
     service.start()
     service.wait()
+   
+    if service.log:
+      log = service.log
+    else: 
+      log = getLogger(hodRingOptions)
+
+    list = []
+    
+    runningHadoops = service.getRunningValues()
+      
+    for cmd in runningHadoops:
+      log.debug("addding %s to cleanup list..." % cmd)
+      cmd.addCleanup(list)
+    
+    list.append(service.getTempDir())
+    log.debug(list)
+       
+    # archive_logs now
+    cmdString = os.path.join(rootDirectory, "bin", "hodcleanup") # same python
+
+    if (len(runningHadoops) == 0):
+      log.info("len(runningHadoops) == 0, No running cluster?")
+      log.info("Skipping __copy_archive_to_dfs")
+      hadoopString = ""
+    else: hadoopString=runningHadoops[0].path
+
+    #construct the arguments
+    if hodRingOptions['hodring'].has_key('log-destination-uri'):
+      cmdString = cmdString + " --log-destination-uri " \
+                    + hodRingOptions['hodring']['log-destination-uri']
+
+    hadoopLogDirs = service.getHadoopLogDirs()
+    if hadoopLogDirs:
+      cmdString = cmdString \
+                    + " --hadoop-log-dirs " \
+                    + ",".join(hadoopLogDirs)
+
+    cmdString = cmdString \
+                  + " --temp-dir " \
+                  + service._cfg['temp-dir'] \
+                  + " --hadoop-command-string " \
+                  + hadoopString \
+                  + " --user-id " \
+                  + service._cfg['userid'] \
+                  + " --service-id " \
+                  + service._cfg['service-id'] \
+                  + " --hodring-debug " \
+                  + str(hodRingOptions['hodring']['debug']) \
+                  + " --hodring-log-dir " \
+                  + hodRingOptions['hodring']['log-dir'] \
+                  + " --hodring-cleanup-list " \
+                  + ",".join(list)
+
+    if hodRingOptions['hodring'].has_key('syslog-address'):
+      cmdString = cmdString + " --hodring-syslog-address " \
+                + hodRingOptions['hodring']['syslog-address']
+    if service._cfg.has_key('pkgs'):
+      cmdString = cmdString + " --pkgs " + service._cfg['pkgs']
+
+    log.info("cleanup commandstring : ")
+    log.info(cmdString)
+
+    # clean up
+    cmd = ['/bin/sh', '-c', cmdString]
+
+    mswindows = (sys.platform == "win32")
+    originalcwd = os.getcwd()
+
+    if not mswindows:
+      try: 
+        pid = os.fork() 
+        if pid > 0:
+          # exit first parent
+          log.info("child(pid: %s) is now doing cleanup" % pid)
+          sys.exit(0) 
+      except OSError, e: 
+        log.error("fork failed: %d (%s)" % (e.errno, e.strerror)) 
+        sys.exit(1)
+
+      # decouple from parent environment
+      os.chdir("/") 
+      os.setsid() 
+      os.umask(0) 
+ 
+    MAXFD = 128 # more than enough file descriptors to close. Just in case.
+    for i in xrange(0, MAXFD):
+      try:
+        os.close(i)
+      except OSError:
+        pass
+  
+    try:
+      os.execvp(cmd[0], cmd)
+    finally:
+      log.critical("exec failed")
+      os._exit(1)

   except Exception:
     if service:
@@ -195,4 +291,4 @@ if __name__ == '__main__':
         log = service.log
     else:
       log = getLogger(hodRingOptions)
-    log.error("bin/hodring failed to start. %s. \nStack trace:\n%s" %(get_exception_error_string(),get_exception_string()))
+    log.error("Error in bin/hodring %s. \nStack trace:\n%s" %(get_exception_error_string(),get_exception_string()))

+ 10 - 5
src/contrib/hod/bin/ringmaster

@@ -122,7 +122,7 @@ defList = { 'ringmaster' : (
               False, None, False, True),    

              ('pbs-account', 'string', 'User Account jobs are submitted under.',
-              False, None, True, False),
+              False, None, False, False),

              ('queue', 'string', 'Queue of the batch scheduler to query.',
               False, None, False, False),
@@ -317,14 +317,19 @@ if __name__ == '__main__':
   confDef.add_defs(defList, defOrder)
   ringMasterOptions = options(confDef, "./%s [OPTIONS]" % myName, VERSION)
   ensureLogDir(ringMasterOptions['ringmaster']['log-dir'])
-  log = getLogger(ringMasterOptions['ringmaster'])
+  log = None

   try:
+    log = getLogger(ringMasterOptions['ringmaster'])
     (status, statusMsgs) = ringMasterOptions.verify()
     if not status:
       raise Exception("%s" % statusMsgs)
+    ringMasterOptions.replace_escape_seqs()
     ringMasterOptions['ringmaster']['base-dir'] = rootDirectory 
-    main(ringMasterOptions,log)
-    sys.exit(0)
+    ret = main(ringMasterOptions,log)
+    sys.exit(ret)
   except Exception, e:
-    log.error("bin/ringmaster failed to start.%s. Stack trace follows:\n%s" % (get_exception_error_string(),get_exception_string()))
+    if log:
+      log.error("bin/ringmaster failed to start.%s. Stack trace follows:\n%s" % (get_exception_error_string(),get_exception_string()))
+    # Ringmaster failing to start is a ringmaster error. Exit with the appropriate exit code.
+    sys.exit(6)

+ 1 - 0
src/contrib/hod/conf/hodrc

@@ -6,6 +6,7 @@ cluster-factor                  = 1.8
 xrs-port-range                  = 32768-65536
 debug                           = 3
 allocate-wait-time              = 3600
+temp-dir                        = /tmp/hod

 [ringmaster]
 register                        = True

+ 1 - 1
src/contrib/hod/getting_started.txt

@@ -26,7 +26,7 @@ functionality from HOD:
   cluster. However, it can also use a pre-installed version of Hadoop,
   if it is available on all nodes in the cluster.
   (http://lucene.apache.org/hadoop)
-  HOD currently supports only Hadoop 0.16, which is under development.
+  HOD currently supports Hadoop 0.15 and above.

 NOTE: HOD configuration requires the location of installs of these 
 components to be the same on all nodes in the cluster. It will also 

+ 1 - 30
src/contrib/hod/hodlib/Common/desc.py

@@ -125,38 +125,9 @@ class ServiceDesc:
     self.dict.setdefault('pkgs', '')
     self.dict.setdefault('final-attrs', {})
     self._checkRequired()
-    self.__dict_update()
-
-  def __dict_update(self):
-    getattr(self, "_%s" % self.dict['id'])()
-
-  def _mapred(self):
-    if self.isExternal():
-      self.dict['final-attrs']['mapred.job.tracker'] = "%s:%s" % (self.dict['host'], 
-        self.dict['tracker_port'])
-      
-      # self.dict['final-attrs']['mapred.job.tracker.info.port'] = \
-      #   str(self.dict['info_port'])
-      # After Hadoop-2185
-      self.dict['final-attrs']['mapred.job.tracker.http.bindAddress'] = \
-        "%s:%s" %(self.dict['host'], self.dict['info_port'])
-      
     if self.dict.has_key('hadoop-tar-ball'):
       self.dict['tar'] = self.dict['hadoop-tar-ball']  
-  
-  def _hdfs(self):
-    if self.isExternal():
-      self.dict['final-attrs']['fs.default.name'] = "%s:%s" % (self.dict['host'], 
-        self.dict['fs_port'])
-      
-      # self.dict['final-attrs']['dfs.info.port'] = str(self.dict['info_port'])
-      # After Hadoop-2185
-      self.dict['final-attrs']['dfs.http.bindAddress'] = "%s:%s" % \
-        (self.dict['host'], self.dict['info_port'])
-      
-    if self.dict.has_key('hadoop-tar-ball'):
-      self.dict['tar'] = self.dict['hadoop-tar-ball']
-  
+
   def _checkRequired(self):

     if not 'id' in self.dict:

+ 1 - 2
src/contrib/hod/hodlib/Common/hodsvc.py

@@ -15,7 +15,6 @@
 #limitations under the License.
 # $Id:setup.py 5158 2007-04-09 00:14:35Z zim $
 #
-# Christopher Zimmerman - zim@yahoo-inc.com - 04/13/2007
 #------------------------------------------------------------------------------
 import os, time, shutil, xmlrpclib, socket, pprint

@@ -51,7 +50,7 @@ class hodBaseService:
     
     self._init_logging()
         
-    self._init_signals()
+    if name != 'serviceRegistry': self._init_signals()
     self._init_xrc_server()
     
   def __set_logging_level(self, level):

+ 26 - 5
src/contrib/hod/hodlib/Common/setup.py

@@ -16,7 +16,6 @@
 # $Id:setup.py 5158 2007-04-09 00:14:35Z zim $
 # $Id:setup.py 5158 2007-04-09 00:14:35Z zim $
 #
-# Christopher Zimmerman - zim@yahoo-inc.com - 04/07/2007
 #------------------------------------------------------------------------------

 """'setup' provides for reading and verifing configuration files based on
 """'setup' provides for reading and verifing configuration files based on
@@ -26,7 +25,7 @@ import sys, os, re, pprint

 from ConfigParser import SafeConfigParser
 from optparse import OptionParser, IndentedHelpFormatter, OptionGroup
-from util import get_perms
+from util import get_perms, replace_escapes
 from types import typeValidator, is_valid_type, typeToString

 reEmailAddress = re.compile("^.*@.*$")
@@ -37,6 +36,8 @@ reCommentHack = re.compile("^.*?\s+#|;.*", flags=re.S)
 reCommentNewline = re.compile("\W$")
 reKeyVal = r"(?<!\\)="
 reKeyVal = re.compile(reKeyVal)
+reKeyValList = r"(?<!\\),"
+reKeyValList = re.compile(reKeyValList)

 errorPrefix = 'error'
 requiredPerms = '0660'
@@ -485,7 +486,7 @@ class config(SafeConfigParser, baseConfig):
                            # Append to the current list of values in self._dict
                            if not self._dict[section].has_key(option):
                              self._dict[section][option] = ""
-                           dictOpts = self._dict[section][option].split(",")
+                           dictOpts = reKeyValList.split(self._dict[section][option])
                            dictOptsKeyVals = {}
                            for opt in dictOpts:
                               if opt != '':
@@ -495,13 +496,16 @@ class config(SafeConfigParser, baseConfig):
                                   # we only consider the first '=' for splitting
                                   # we do this to support passing params like
                                   # mapred.child.java.opts=-Djava.library.path=some_dir
+                                  # Even in case of an invalid error like unescaped '=',
+                                  # we don't want to fail here itself. We leave such errors 
+                                  # to be caught during validation which happens after this
                                   dictOptsKeyVals[key] = val
                                 else: 
                                   # this means an invalid option. Leaving it
                                   #for config.verify to catch
                                   dictOptsKeyVals[opt] = None
                                 
-                           cmdLineOpts = self._options[section][option].split(",")
+                           cmdLineOpts = reKeyValList.split(self._options[section][option])

                            for opt in cmdLineOpts:
                               if reKeyVal.search(opt):
@@ -573,6 +577,10 @@ class config(SafeConfigParser, baseConfig):
             raise Exception( error)
             sys.exit(1)

+    def replace_escape_seqs(self):
+      """ replace any escaped characters """
+      replace_escapes(self)
+
 class formatter(IndentedHelpFormatter):
     def format_option_strings(self, option):
         """Return a comma-separated list of option strings & metavariables."""
@@ -667,11 +675,21 @@ class options(OptionParser, baseConfig):
             self.config = self.__parsedOptions.config
             if not self.config:
                 self.error("configuration file must be specified")
+            if not os.path.isabs(self.config):
+                # A relative path. Append the original directory which would be the
+                # current directory at the time of launch
+                try:  
+                    origDir = getattr(self.__parsedOptions, 'hod.original-dir')
+                    if origDir is not None:
+                        self.config = os.path.join(origDir, self.config)
+                        self.__parsedOptions.config = self.config
+                except AttributeError, e:
+                    self.error("hod.original-dir is not defined.\
+                                   Cannot get current directory")
             if not os.path.exists(self.config):
                 if self.__defaultLoc and not re.search("/", self.config):
                     self.__parsedOptions.config = os.path.join(
                         self.__defaultLoc, self.config)
-    
         self.__build_dict()   

     
@@ -910,3 +928,6 @@ class options(OptionParser, baseConfig):
                         
     def verify(self):
         return baseConfig.verify(self)
+
+    def replace_escape_seqs(self):
+      replace_escapes(self)
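The reKeyValList pattern added above splits key/value lists only at commas that are not escaped with a backslash, and replace_escape_seqs later strips those escapes via replace_escapes. A small sketch of the behaviour the two pieces combine to give (the sample string is illustrative):

import re

reKeyValList = re.compile(r"(?<!\\),")   # split on unescaped commas only
reEscapeSeq = re.compile(r"\\(.)")       # \x -> x

raw = r"mapred.child.java.opts=-Dpath=a\,b,io.sort.mb=100"
pairs = reKeyValList.split(raw)
# ['mapred.child.java.opts=-Dpath=a\\,b', 'io.sort.mb=100']
pairs = [reEscapeSeq.sub(r"\1", p) for p in pairs]
# ['mapred.child.java.opts=-Dpath=a,b', 'io.sort.mb=100']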

+ 0 - 1
src/contrib/hod/hodlib/Common/socketServers.py

@@ -15,7 +15,6 @@
 #limitations under the License.
 # Various socket server and helper classes.
 #
-# Christopher Zimmerman - zim@yahoo-inc.com - 03/07/2007
 #
 import os, sys, socket, threading, pprint, re, xmlrpclib, time
   

+ 0 - 1
src/contrib/hod/hodlib/Common/tcp.py

@@ -15,7 +15,6 @@
 #limitations under the License.
 # $Id:tcp.py 6172 2007-05-22 20:26:54Z zim $
 #
-# Christopher Zimmerman - zim@yahoo-inc.com - 04/07/2007
 #------------------------------------------------------------------------------

 """ TCP related classes. """

+ 5 - 2
src/contrib/hod/hodlib/Common/threads.py

@@ -132,13 +132,16 @@ class simpleCommand(baseThread):
                 output = cmd.fromchild.readline()

         elif self.__wait == False:
-            for output in cmd.fromchild.readlines():
+            output = cmd.fromchild.readline()
+            while output != '':
                 while not self.running.isSet():
                     if self.stopFlag.isSet():
                         break
                     time.sleep(1)
-                
                 print output,
+                if self.stopFlag.isSet():
+                    break
+                output = cmd.fromchild.readline()
         else:
             self.stdout = cmd.fromchild


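The threads.py change swaps readlines(), which blocks until the child exits, for an incremental readline() loop, so output is relayed as it is produced and the stop flag can break in mid-stream. A standalone Python 2 sketch of the same loop using the subprocess module (popen2's fromchild in the patch plays the role of proc.stdout here):

import subprocess

proc = subprocess.Popen(["sh", "-c", "echo one; sleep 1; echo two"],
                        stdout=subprocess.PIPE)
line = proc.stdout.readline()
while line != '':                  # readline() returns '' only at EOF
    print line,                    # relay each line as soon as it arrives
    line = proc.stdout.readline()
proc.wait()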
+ 15 - 13
src/contrib/hod/hodlib/Common/types.py

@@ -15,7 +15,6 @@
 #limitations under the License.
 # $Id:types.py 6172 2007-05-22 20:26:54Z zim $
 #
-# Christopher Zimmerman - zim@yahoo-inc.com - 04/07/2007
 #------------------------------------------------------------------------------

 """ Higher level data types and type related classes.
@@ -325,12 +324,17 @@ class typeToString:
         return value

     def __tostring_keyval(self, value):
-        string = ''
+        string = '"' # to protect from shell escapes
         for key in value:
-            for item in value[key]:
-                string = "%s%s=%s," % (string, key, item)
-                
-        return string[:-1]  
+          # for item in value[key]:
+          #      string = "%s%s=%s," % (string, key, item)
+          # Quotes still cannot protect Double-slashes.
+          # Dealing with them separately
+          val = re.sub(r"\\\\",r"\\\\\\\\",value[key])
+
+          string = "%s%s=%s," % (string, key, val)
+
+        return string[:-1] + '"'

     def __tostring_list(self, value):
         string = ''
@@ -678,13 +682,11 @@ class typeValidator:
         list = self.__norm_list(value)
         keyValue = {}
         for item in list:
-            # we only consider the first '=' for splitting
-            # we do this to support passing params like 
-            # mapred.child.java.opts=-Djava.library.path=some_dir
-            (key, value) = reKeyVal.split(item,1)
-            if not keyValue.has_key(key):
-                keyValue[key] = []
-            keyValue[key].append(value)
+            (key, value) = reKeyVal.split(item)
+            #if not keyValue.has_key(key):
+            #    keyValue[key] = []
+            #keyValue[key].append(value)
+            keyValue[key] = value
         return keyValue     

     def __verify_list(self, type, value):

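__tostring_keyval now wraps the serialized key=value list in double quotes so it survives shell quoting on the way back to a command line, doubling double-backslashes first because quotes alone do not protect them. A minimal sketch of that serialization (the dict contents are illustrative):

import re

def keyval_to_string(keyval):
    string = '"'                 # quote the whole list for the shell
    for key, value in keyval.items():
        value = re.sub(r"\\\\", r"\\\\\\\\", value)  # \\ becomes \\\\
        string = "%s%s=%s," % (string, key, value)
    return string[:-1] + '"'

print keyval_to_string({'mapred.child.java.opts': '-Djava.library.path=/some/dir'})
# "mapred.child.java.opts=-Djava.library.path=/some/dir"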
+ 120 - 1
src/contrib/hod/hodlib/Common/util.py

@@ -13,12 +13,17 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
-import sys, os, traceback, stat, socket, re, warnings
+import sys, os, traceback, stat, socket, re, warnings, signal

 from hodlib.Common.tcp import tcpSocket, tcpError 
 from hodlib.Common.threads import simpleCommand

 setUGV   = { 'S_ISUID' : 2, 'S_ISGID' : 1, 'S_ISVTX' : 0 }
+reEscapeSeq = r"\\(.)?"
+reEscapeSeq = re.compile(reEscapeSeq)
+
+HOD_INTERRUPTED_CODE = 127
+HOD_INTERRUPTED_MESG = "Hod Interrupted. Cleaning up and exitting"

 class AlarmException(Exception):
     def __init__(self, msg=''):
@@ -170,3 +175,117 @@ def args_to_string(list):
   for item in list:
     arg = "%s%s " % (arg, item)
   return arg[:-1]
+
+def replace_escapes(object):
+  """ replace any escaped character. e.g \, with , \= with = and so on """
+  # here object is either a config object or a options object
+  for section in object._mySections:
+    for option in object._configDef[section].keys():
+      if object[section].has_key(option):
+        if object._configDef[section][option]['type'] == 'keyval':
+          keyValDict = object[section][option]
+          object[section][option] = {}
+          for (key,value) in keyValDict.iteritems():
+            match = reEscapeSeq.search(value)
+            if match:
+              value = reEscapeSeq.sub(r"\1", value)
+            object[section][option][key] = value
+
+def hadoopVersion(hadoopDir, java_home, log):
+  # Determine the version of hadoop being used by executing the 
+  # hadoop version command. Code earlier in idleTracker.py
+  hadoopVersion = { 'major' : None, 'minor' : None }
+  hadoopPath = os.path.join(hadoopDir, 'bin', 'hadoop')
+  cmd = "%s version" % hadoopPath
+  log.debug('Executing command %s to find hadoop version' % cmd)
+  env = os.environ
+  env['JAVA_HOME'] = java_home
+  hadoopVerCmd = simpleCommand('HadoopVersion', cmd, env)
+  hadoopVerCmd.start()
+  hadoopVerCmd.wait()
+  hadoopVerCmd.join()
+  if hadoopVerCmd.exit_code() == 0:
+    verLine = hadoopVerCmd.output()[0]
+    log.debug('Version from hadoop command: %s' % verLine)
+    hadoopVerRegExp = re.compile("Hadoop ([0-9]+)\.([0-9]+).*")
+    verMatch = hadoopVerRegExp.match(verLine)
+    if verMatch != None:
+      hadoopVersion['major'] = verMatch.group(1)
+      hadoopVersion['minor'] = verMatch.group(2)
+  return hadoopVersion
+
+
+def get_cluster_status(hdfsAddress, mapredAddress):
+  """Determine the status of the cluster based on socket availability
+     of HDFS and Map/Reduce."""
+  status = 0
+
+  mapredSocket = tcpSocket(mapredAddress)
+  try:
+    mapredSocket.open()
+    mapredSocket.close()
+  except tcpError:
+    status = 14
+
+  hdfsSocket = tcpSocket(hdfsAddress)
+  try:
+    hdfsSocket.open()
+    hdfsSocket.close()
+  except tcpError:
+    if status > 0:
+      status = 10
+    else:
+      status = 13
+
+  return status
+
+def parseEquals(list):
+  # takes in a list of keyval pairs e.g ['a=b','c=d'] and returns a
+  # dict e.g {'a'='b','c'='d'}. Used in GridService/{mapred.py/hdfs.py} and 
+  # HodRing/hodring.py. No need for specially treating escaped =. as in \=,
+  # since all keys are generated by hod and don't contain such anomalies
+  dict = {}
+  for elems in list:
+    splits = elems.split('=')
+    dict[splits[0]] = splits[1]
+  return dict
+
+class HodInterrupt:
+  def __init__(self):
+    self.HodInterruptFlag = False
+    self.log = None
+
+  def set_log(self, log):
+    self.log = log
+
+  def init_signals(self):
+
+    def sigStop(sigNum, handler):
+      sig_wrapper(sigNum, self.setFlag)
+
+    signal.signal(signal.SIGTERM, sigStop) # 15 : software termination signal
+    signal.signal(signal.SIGQUIT, sigStop) # 3  : Quit program
+    signal.signal(signal.SIGINT, sigStop)  # 2 ^C : Interrupt program
+
+    def sig_wrapper(sigNum, handler, *args):
+      self.log.critical("Caught signal %s." % sigNum )
+
+      if args:
+          handler(args)
+      else:
+          handler()
+
+  def setFlag(self, val = True):
+    self.HodInterruptFlag = val
+
+  def isSet(self):
+    return self.HodInterruptFlag
+
+class HodInterruptException(Exception):
+  def __init__(self, value = ""):
+    self.value = value
+    
+  def __str__(self):
+    return repr(self.value)
+
+hodInterrupt = HodInterrupt()

+ 3 - 0
src/contrib/hod/hodlib/Common/xmlrpc.py

@@ -14,6 +14,7 @@
 #See the License for the specific language governing permissions and
 #limitations under the License.
 import xmlrpclib, time, random, signal
+from hodlib.Common.util import hodInterrupt, HodInterruptException

 class hodXRClient(xmlrpclib.ServerProxy):
     def __init__(self, uri, transport=None, encoding=None, verbose=0,
@@ -42,6 +43,8 @@ class hodXRClient(xmlrpclib.ServerProxy):
                 break
             except Exception:
                 if self.__retryRequests:
+                  if hodInterrupt.isSet():
+                    raise HodInterruptException()
                   time.sleep(retryWaitTime)
                 else:
                   raise Exception("hodXRClientTimeout")

+ 65 - 36
src/contrib/hod/hodlib/GridServices/hdfs.py

@@ -22,15 +22,16 @@ import os
 from service import *
 from hodlib.Hod.nodePool import *
 from hodlib.Common.desc import CommandDesc
-from hodlib.Common.util import get_exception_string
+from hodlib.Common.util import get_exception_string, parseEquals

 class HdfsExternal(MasterSlave):
   """dummy proxy to external HDFS instance"""

-  def __init__(self, serviceDesc, workDirs):
+  def __init__(self, serviceDesc, workDirs, version):
     MasterSlave.__init__(self, serviceDesc, workDirs,None)
     self.launchedMaster = True
     self.masterInitialized = True
+    self.version = version
     
   def getMasterRequest(self):
     return None
@@ -49,21 +50,33 @@ class HdfsExternal(MasterSlave):
     addr = attrs['fs.default.name']
     return [addr]
   
-  def setMasterParams(self, list):
-    raise NotImplementedError
+  def setMasterParams(self, dict):
+   self.serviceDesc.dict['final-attrs']['fs.default.name'] = "%s:%s" % \
+     (dict['host'], dict['fs_port'])
+
+   if self.version < 16:
+    self.serviceDesc.dict['final-attrs']['dfs.info.port'] = \
+                                    str(self.serviceDesc.dict['info_port'])
+   else:
+     # After Hadoop-2185
+     self.serviceDesc.dict['final-attrs']['dfs.http.bindAddress'] = "%s:%s" % \
+       (dict['host'], dict['info_port'])
 

   def getInfoAddrs(self):
     attrs = self.serviceDesc.getfinalAttrs()
-    k,v = addr.split( ":")
-    # infoaddr = k + ':' + attrs['dfs.info.port']
-    # After Hadoop-2185
-    infoaddr = attrs['dfs.http.bindAddress']
+    if self.version < 16:
+      addr = attrs['fs.default.name']
+      k,v = addr.split( ":")
+      infoaddr = k + ':' + attrs['dfs.info.port']
+    else:
+      # After Hadoop-2185
+      infoaddr = attrs['dfs.http.bindAddress']
     return [infoaddr]
     return [infoaddr]

 class Hdfs(MasterSlave):

+  def __init__(self, serviceDesc, nodePool, required_node, version, \
+                                        format=True, upgrade=False):
     MasterSlave.__init__(self, serviceDesc, nodePool, required_node)
     MasterSlave.__init__(self, serviceDesc, nodePool, required_node)
     self.masterNode = None
     self.masterAddr = None
     self.format = format
     self.format = format
     self.upgrade = upgrade
     self.workers = []
 

   def getMasterRequest(self):
     req = NodeRequest(1, [], False)
     self.masterAddr = dict['fs.default.name']
     self.masterAddr = dict['fs.default.name']
     k,v = self.masterAddr.split( ":")
     self.masterNode = k
-    # After Hadoop-2185
-    self.infoAddr = dict['dfs.http.bindAddress']
+    if self.version < 16:
+      self.infoAddr = self.masterNode + ':' + dict['dfs.info.port']
+    else:
+      # After Hadoop-2185
+      self.infoAddr = dict['dfs.http.bindAddress']
    
    
   def _parseEquals(self, list):
-    for elems in list:
-      splits = elems.split('=')
-      dict[splits[0]] = splits[1]
-    return dict
+    return parseEquals(list)
   
   def _getNameNodePort(self):
     sd = self.serviceDesc
@@ -152,16 +164,25 @@ class Hdfs(MasterSlave):
   def _getNameNodeInfoPort(self):
     sd = self.serviceDesc
     attrs = sd.getfinalAttrs()
-    if 'dfs.http.bindAddress' not in attrs:
-      return ServiceUtil.getUniqPort()
+    if self.version < 16:
+      if 'dfs.info.bindAddress' not in attrs:
+        return ServiceUtil.getUniqPort()
+    else:
+      if 'dfs.http.bindAddress' not in attrs:
+        return ServiceUtil.getUniqPort()
 
 
-    # p = attrs['dfs.info.port'] 
-    p = attrs['dfs.http.bindAddress'].split(':')[1]
+    if self.version < 16:
+      p = attrs['dfs.info.port']
+    else:
+      p = attrs['dfs.http.bindAddress'].split(':')[1]
     try:
       return int(p)
     except:
       print get_exception_string()
-      raise ValueError, "Can't find port from attr dfs.info.port: %s" % (p)
+      if self.version < 16:
+        raise ValueError, "Can't find port from attr dfs.info.port: %s" % (p)
+      else:
+        raise ValueError, "Can't find port from attr dfs.http.bindAddress: %s" % (p)

   def _setWorkDirs(self, workDirs, envs, attrs, parentDirs, subDir):
     namedir = None
@@ -183,7 +204,7 @@ class Hdfs(MasterSlave):
     attrs['dfs.name.dir'] = namedir
     attrs['dfs.data.dir'] = ','.join(datadir)
     # FIXME -- change dfs.client.buffer.dir
-    envs['HADOOP_ROOT_LOGGER'] = ["INFO,DRFA",]
+    envs['HADOOP_ROOT_LOGGER'] = "INFO,DRFA"


   def _getNameNodeCommand(self, format=False, upgrade=False):
@@ -199,13 +220,14 @@ class Hdfs(MasterSlave):
       attrs['fs.default.name'] = 'fillinhostport'
     #self.infoPort = port = self._getNameNodeInfoPort()
  
-    # if 'dfs.info.port' not in attrs:
-    #  attrs['dfs.info.port'] = 'fillinport'
-   
-    # Addressing Hadoop-2815, added the following. Earlier version don't
-    # care about this
-    if 'dfs.http.bindAddress' not in attrs:
-      attrs['dfs.http.bindAddress'] = 'fillinhostport'
+    if self.version < 16:
+     if 'dfs.info.port' not in attrs:
+      attrs['dfs.info.port'] = 'fillinport'
+    else:
+      # Addressing Hadoop-2815, added the following. Earlier versions don't
+      # care about this
+      if 'dfs.http.bindAddress' not in attrs:
+        attrs['dfs.http.bindAddress'] = 'fillinhostport'
 

     self._setWorkDirs(workDirs, envs, attrs, parentDirs, 'hdfs-nn')

 

     attrs['fs.default.name'] = nn

-    if 'dfs.datanode.bindAddress' not in attrs:
-      attrs['dfs.datanode.bindAddress'] = 'fillinhostport'
-    if 'dfs.datanode.http.bindAddress' not in attrs:
-      attrs['dfs.datanode.http.bindAddress'] = 'fillinhostport'
+    if self.version < 16:
+      if 'dfs.datanode.port' not in attrs:
+        attrs['dfs.datanode.port'] = 'fillinport'
+      if 'dfs.datanode.info.port' not in attrs:
+        attrs['dfs.datanode.info.port'] = 'fillinport'
+    else:
+      # Adding the following. Hadoop-2815
+      if 'dfs.datanode.bindAddress' not in attrs:
+        attrs['dfs.datanode.bindAddress'] = 'fillinhostport'
+      if 'dfs.datanode.http.bindAddress' not in attrs:
+        attrs['dfs.datanode.http.bindAddress'] = 'fillinhostport'
+    
     self._setWorkDirs(workDirs, envs, attrs, parentDirs, 'hdfs-dn')
     self._setWorkDirs(workDirs, envs, attrs, parentDirs, 'hdfs-dn')

     dict = { 'name' : 'datanode' }
+ 71 - 43
src/contrib/hod/hodlib/GridServices/mapred.py

@@ -22,15 +22,16 @@ import os, copy, time
 from service import *
 from hodlib.Hod.nodePool import *
 from hodlib.Common.desc import CommandDesc
-from hodlib.Common.util import get_exception_string
+from hodlib.Common.util import get_exception_string, parseEquals

 class MapReduceExternal(MasterSlave):
   """dummy proxy to external MapReduce instance"""

-  def __init__(self, serviceDesc, workDirs):
+  def __init__(self, serviceDesc, workDirs, version):
     MasterSlave.__init__(self, serviceDesc, workDirs,None)
     self.launchedMaster = True
     self.masterInitialized = True
+    self.version = version
     
   def getMasterRequest(self):
     return None
@@ -55,22 +56,33 @@ class MapReduceExternal(MasterSlave):
   def needsLess(self):
     return 0

-  def setMasterParams(self, list):
-    raise NotImplementedError
-  
+  def setMasterParams(self, dict):
+    self.serviceDesc['final-attrs']['mapred.job.tracker'] = "%s:%s" % (dict['host'], 
+      dict['tracker_port'])
+    
+    if self.version < 16:
+      self.serviceDesc.dict['final-attrs']['mapred.job.tracker.info.port'] = \
+                                      str(self.serviceDesc.dict['info_port'])
+    else:
+      # After Hadoop-2185
+      self.serviceDesc['final-attrs']['mapred.job.tracker.http.bindAddress'] = \
+        "%s:%s" %(dict['host'], dict['info_port'])
+
   def getInfoAddrs(self):
     attrs = self.serviceDesc.getfinalAttrs()
-    addr = attrs['mapred.job.tracker']
-    k,v = addr.split( ":")
-    # infoaddr = k + ':' + attrs['mapred.job.tracker.info.port']
-    # After Hadoop-2185
-    # Note: earlier,we never respected mapred.job.tracker.http.bindAddress
-    infoaddr = attrs['mapred.job.tracker.http.bindAddress']
+    if self.version < 16:
+      addr = attrs['mapred.job.tracker']
+      k,v = addr.split( ":")
+      infoaddr = k + ':' + attrs['mapred.job.tracker.info.port']
+    else:
+      # After Hadoop-2185
+      # Note: earlier,we never respected mapred.job.tracker.http.bindAddress
+      infoaddr = attrs['mapred.job.tracker.http.bindAddress']
     return [infoaddr]
   
 class MapReduce(MasterSlave):

-  def __init__(self, serviceDesc, workDirs,required_node):
+  def __init__(self, serviceDesc, workDirs,required_node, version):
     MasterSlave.__init__(self, serviceDesc, workDirs,required_node)

     self.masterNode = None
@@ -78,6 +90,7 @@ class MapReduce(MasterSlave):
     self.infoAddr = None
     self.workers = []
     self.required_node = required_node
+    self.version = version

   def isLaunchable(self, serviceDict):
     hdfs = serviceDict['hdfs']
@@ -127,16 +140,14 @@ class MapReduce(MasterSlave):
     self.masterAddr = dict['mapred.job.tracker']
     k,v = self.masterAddr.split(":")
     self.masterNode = k
-    # self.infoAddr = self.masterNode + ':' + dict['mapred.job.tracker.info.port']
-    # After Hadoop-2185
-    self.infoAddr = dict['mapred.job.tracker.http.bindAddress']
+    if self.version < 16:
+      self.infoAddr = self.masterNode + ':' + dict['mapred.job.tracker.info.port']
+    else:
+      # After Hadoop-2185
+      self.infoAddr = dict['mapred.job.tracker.http.bindAddress']
   
   def _parseEquals(self, list):
-    dict = {}
-    for elems in list:
-      splits = elems.split('=')
-      dict[splits[0]] = splits[1]
-    return dict
+    return parseEquals(list)
 
   def _getJobTrackerPort(self):
     sd = self.serviceDesc
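
The inlined loop deleted above documents what the shared hodlib.Common.util.parseEquals helper has to do: turn a list of 'key=value' strings into a dict. A sketch consistent with the deleted code (the one liberty taken here is splitting on the first '=' only, so values may themselves contain '='):

    def parseEquals(elems):
      # ['a=1', 'b=2'] -> {'a': '1', 'b': '2'}
      equalsDict = {}
      for elem in elems:
        key, value = elem.split('=', 1)
        equalsDict[key] = value
      return equalsDict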
@@ -152,21 +163,29 @@ class MapReduce(MasterSlave):
       print get_exception_string()
       raise ValueError, "Can't find port from attr mapred.job.tracker: %s" % (v)
 
+  # UNUSED METHOD
   def _getJobTrackerInfoPort(self):
     sd = self.serviceDesc
     attrs = sd.getfinalAttrs()
-    # if not 'mapred.job.tracker.info.port' in attrs:
-    if 'mapred.job.tracker.http.bindAddress' not in attrs:
-      return ServiceUtil.getUniqPort()
-
-    # p = attrs['mapred.job.tracker.info.port']
-    p = attrs['mapred.job.tracker.http.bindAddress']
+    if self.version < 16:
+      if not 'mapred.job.tracker.info.port' in attrs:
+        return ServiceUtil.getUniqPort()
+    else:
+      if 'mapred.job.tracker.http.bindAddress' not in attrs:
+        return ServiceUtil.getUniqPort()
+
+    if self.version < 16:
+      p = attrs['mapred.job.tracker.info.port']
+    else:
+      p = attrs['mapred.job.tracker.http.bindAddress'].split(':')[1]
     try:
       return int(p)
     except:
       print get_exception_string()
-      # raise ValueError, "Can't find port from attr mapred.job.tracker.info.port: %s" % (p)
-      raise ValueError, "Can't find port from attr mapred.job.tracker.http.bindAddress: %s" % (p)
+      if self.version < 16:
+        raise ValueError, "Can't find port from attr mapred.job.tracker.info.port: %s" % (p)
+      else:
+        raise ValueError, "Can't find port from attr mapred.job.tracker.http.bindAddress: %s" % (p)
 
   def _setWorkDirs(self, workDirs, envs, attrs, parentDirs, subDir):
     local = []
@@ -193,7 +212,7 @@ class MapReduce(MasterSlave):
     attrs['dfs.client.buffer.dir'] = ','.join(dfsclient)
 
 
-    envs['HADOOP_ROOT_LOGGER'] = ["INFO,DRFA",]
+    envs['HADOOP_ROOT_LOGGER'] = "INFO,DRFA"
 
 
   def _getJobTrackerCommand(self, hdfs):
@@ -201,25 +220,28 @@ class MapReduce(MasterSlave):
 
     parentDirs = self.workDirs
     workDirs = []
-    attrs = sd.getfinalAttrs()
-    envs = sd.getEnvs()
+    attrs = sd.getfinalAttrs().copy()
+    envs = sd.getEnvs().copy()
 
     #self.masterPort = port = self._getJobTrackerPort()
     if 'mapred.job.tracker' not in attrs:
       attrs['mapred.job.tracker'] = 'fillinhostport'
 
     #self.infoPort = port = self._getJobTrackerInfoPort()
-    # if 'mapred.job.tracker.info.port' not in attrs:
-    #   attrs['mapred.job.tracker.info.port'] = 'fillinport'
+    if self.version < 16:
+      if 'mapred.job.tracker.info.port' not in attrs:
+        attrs['mapred.job.tracker.info.port'] = 'fillinport'
+    else:
+      # Addressing Hadoop-2815,
+      if 'mapred.job.tracker.http.bindAddress' not in attrs:
+        attrs['mapred.job.tracker.http.bindAddress'] = 'fillinhostport'
 
     attrs['fs.default.name'] = hdfs.getMasterAddrs()[0]
-    # Addressing Hadoop-2815,
-    if 'mapred.job.tracker.http.bindAddress' not in attrs:
-      attrs['mapred.job.tracker.http.bindAddress'] = 'fillinhostport'
 
     self._setWorkDirs(workDirs, envs, attrs, parentDirs, 'mapred-jt')
 
     dict = { 'name' : 'jobtracker' }
+    dict['version'] = self.version
     dict['program'] = os.path.join('bin', 'hadoop')
     dict['argv'] = ['jobtracker']
     dict['envs'] = envs
@@ -236,8 +258,8 @@ class MapReduce(MasterSlave):
 
     parentDirs = self.workDirs
     workDirs = []
-    attrs = sd.getfinalAttrs()
-    envs = sd.getEnvs()
+    attrs = sd.getfinalAttrs().copy()
+    envs = sd.getEnvs().copy()
     jt = self.masterAddr
 
     if jt == None:
@@ -246,11 +268,17 @@ class MapReduce(MasterSlave):
     attrs['mapred.job.tracker'] = jt
     attrs['fs.default.name'] = hdfs.getMasterAddrs()[0]
 
-    # Adding the following. Hadoop-2815
-    if 'mapred.task.tracker.report.bindAddress' not in attrs:
-      attrs['mapred.task.tracker.report.bindAddress'] = 'fillinhostport'
-    if 'mapred.task.tracker.http.bindAddress' not in attrs:
-      attrs['mapred.task.tracker.http.bindAddress'] = 'fillinhostport'
+    if self.version < 16:
+      if 'tasktracker.http.port' not in attrs:
+        attrs['tasktracker.http.port'] = 'fillinport'
+      # prior to 0.16, tasktrackers always took ephemeral port 0 for
+      # tasktracker.report.bindAddress
+    else:
+      # Adding the following. Hadoop-2815
+      if 'mapred.task.tracker.report.bindAddress' not in attrs:
+        attrs['mapred.task.tracker.report.bindAddress'] = 'fillinhostport'
+      if 'mapred.task.tracker.http.bindAddress' not in attrs:
+        attrs['mapred.task.tracker.http.bindAddress'] = 'fillinhostport'
 
     self._setWorkDirs(workDirs, envs, attrs, parentDirs, 'mapred-tt')
 
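
The 'fillinport' and 'fillinhostport' values used throughout this file are placeholders rather than real addresses; judging by getFilledInKeyValues() in hodRing.py further down, they are substituted with a concrete port or host:port when the daemon is actually launched on its node. A purely hypothetical illustration of that substitution step (the function name is invented here):

    def fill_placeholders(attrs, host, port):
      # replace marker values with the node's real address
      filled = attrs.copy()
      for key, value in filled.items():
        if value == 'fillinhostport':
          filled[key] = '%s:%s' % (host, port)
        elif value == 'fillinport':
          filled[key] = str(port)
      return filled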

+ 186 - 124
src/contrib/hod/hodlib/Hod/hadoop.py

@@ -57,8 +57,8 @@ class hadoopConfig:
     
     return prop
 
-  def gen_site_conf(self, confDir, numNodes, hdfsAddr, mapredAddr=None,\
-             clientParams=None, serverParams=None,\
+  def gen_site_conf(self, confDir, tempDir, numNodes, hdfsAddr,\
+             mapredAddr=None, clientParams=None, serverParams=None,\
              finalServerParams=None, clusterFactor=None):
     if not mapredAddr:
       mapredAddr = "dummy:8181"
@@ -69,51 +69,58 @@ class hadoopConfig:
       "This is an auto generated hadoop-site.xml, do not modify")
       "This is an auto generated hadoop-site.xml, do not modify")
     topElement = doc.documentElement
     topElement = doc.documentElement
     topElement.appendChild(comment)
     topElement.appendChild(comment)
-    prop = self.__create_xml_element(doc, 'mapred.job.tracker', 
-                                     mapredAddr, "description")
-    topElement.appendChild(prop)
-    prop = self.__create_xml_element(doc, 'fs.default.name', hdfsAddr, 
-                                   "description")
-    topElement.appendChild(prop)
+
+    description = {}
+    paramsDict = {  'mapred.job.tracker'    : mapredAddr , \
+                    'fs.default.name'       : hdfsAddr, \
+                    'hadoop.tmp.dir'        : confDir, \
+                    'dfs.client.buffer.dir' : tempDir, }
+
     mapredAddrSplit = mapredAddr.split(":")
     mapredAddrSplit = mapredAddr.split(":")
     mapredsystem = os.path.join('/mapredsystem', mapredAddrSplit[0])
     mapredsystem = os.path.join('/mapredsystem', mapredAddrSplit[0])
-    prop = self.__create_xml_element(doc, 'mapred.system.dir', mapredsystem, 
-                                   "description", True )
-    topElement.appendChild(prop)
-    prop = self.__create_xml_element(doc, 'hadoop.tmp.dir', confDir, 
-                                   "description")
-    topElement.appendChild(prop)
-    prop = self.__create_xml_element(doc, 'dfs.client.buffer.dir', 
-                                     confDir, "description")
-    topElement.appendChild(prop)
-
-    # clientParams aer enabled now
-    if clientParams:
-      for k, v in clientParams.iteritems():
-        prop = self.__create_xml_element(doc, k, v[0], "client param")
-        topElement.appendChild(prop)
-
+    paramsDict['mapred.system.dir'] = mapredsystem 
+    
+    # mapred-default.xml is no longer used now.
+    numred = int(math.floor(clusterFactor * (int(numNodes) - 1)))
+    paramsDict['mapred.reduce.tasks'] = str(numred)
     # end
 
-    # servelParams
-    if serverParams:
-      for k, v in serverParams.iteritems():
-        prop = self.__create_xml_element(doc, k, v[0], "server param")
-        topElement.appendChild(prop)
+    # for all the above vars generated, set the description
+    for k, v in paramsDict.iteritems():
+      description[k] = 'Hod generated parameter'
 
     # finalServerParams
     if finalServerParams:
       for k, v in finalServerParams.iteritems():
-        prop = self.__create_xml_element(doc, k, v[0], "server param", True)
-        topElement.appendChild(prop)
+        if not description.has_key(k):
+          description[k] = "final server parameter"
+          paramsDict[k] = v
 
-   
-    # mapred-default.xml is no longer used now.
-    numred = int(math.floor(clusterFactor * (int(numNodes) - 1)))
-    prop = self.__create_xml_element(doc, "mapred.reduce.tasks", str(numred), 
-                                 "description")
-    topElement.appendChild(prop)
-    # end
+    # serverParams
+    if serverParams:
+      for k, v in serverParams.iteritems():
+        if not description.has_key(k):
+          # if no final value for same param is mentioned
+          description[k] = "server parameter"
+          paramsDict[k] = v
+
+    # clientParams
+    if clientParams:
+      for k, v in clientParams.iteritems():
+        if not description.has_key(k) or description[k] == "server parameter":
+          # Just add, if no final value for same param is mentioned.
+          # Replace even if server param is mentioned for same config variable
+          description[k] = "client-side parameter"
+          paramsDict[k] = v
+    
+    # generate the xml elements
+    for k,v in paramsDict.iteritems():
+      if ( description[k] == "final server parameter" or \
+                             description[k] == "Hod generated parameter" ): 
+         final = True
+      else: final = False
+      prop = self.__create_xml_element(doc, k, v, description[k], final)
+      topElement.appendChild(prop)
 
     siteName = os.path.join(confDir, "hadoop-site.xml")
     sitefile = file(siteName, 'w')
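
The description bookkeeping above encodes a fixed precedence among the parameter sources. Restated with plain dicts (later update calls win), the net effect is: HOD-generated values beat final-server-params, which beat client-params, which beat plain server-params; only HOD-generated and final-server values are written with final=true. A sketch of the equivalent merge, ignoring the v[0] list wrapping of the real config objects:

    def effective_params(hodGenerated, finalServer, server, client):
      merged = dict(server)
      merged.update(client)        # client values replace plain server values
      merged.update(finalServer)   # but never final server values
      merged.update(hodGenerated)  # and never what HOD computed itself
      return merged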
@@ -174,44 +181,15 @@ class hadoopCluster:
     
     return serviceData
   
-  def __check_allocation_manager(self):
-    userValid = True
-    try:
-      self.serviceProxyClient = hodXRClient(
-        to_http_url(self.__cfg['hod']['proxy-xrs-address']), None, None, 0,
-        0, 1, False, 15)
-      
-      userValid = self.serviceProxyClient.isProjectUserValid(
-        self.__setup.cfg['hod']['userid'], 
-        self.__setup.cfg['resource_manager']['pbs-account'],True)
-      
-      if userValid:
-        self.__log.debug("Validated that user %s is part of project %s." %
-          (self.__cfg['hod']['userid'], 
-           self.__cfg['resource_manager']['pbs-account']))
-      else:
-        self.__log.debug("User %s is not part of project: %s." % (
-          self.__cfg['hod']['userid'], 
-          self.__cfg['resource_manager']['pbs-account']))
-        self.__log.error("Please specify a valid project in "
-                      + "resource_manager.pbs-account. If you still have "
-                      + "issues, please contact operations")
-        userValidd = False
-        # ignore invalid project for now - TODO
-    except Exception:
-      # ignore failures - non critical for now
-      self.__log.debug(
-        "Unable to contact Allocation Manager Proxy - ignoring...")
-      #userValid = False
-        
-    return userValid
-
   def __check_job_status(self):
     initWaitCount = 20
     count = 0
     status = False
     state = 'Q'
     while state == 'Q':
+      if hodInterrupt.isSet():
+        raise HodInterruptException()
+
       state = self.__nodePool.getJobState()
       if (state==False) or (state!='Q'):
         break
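
The two lines added here recur in every polling loop this patch touches: check the process-wide hodInterrupt flag (presumably set from a signal handler in hodlib.Common.util) and convert it into a HodInterruptException, so a Ctrl-C unwinds cleanly instead of leaving the loop spinning. The generic shape, assuming only the isSet() API used in this file:

    import time
    from hodlib.Common.util import hodInterrupt, HodInterruptException

    def wait_until(condition, delay=1):
      # sketch of the interruptible wait used by __check_job_status
      while not condition():
        if hodInterrupt.isSet():
          raise HodInterruptException()
        time.sleep(delay)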
@@ -241,6 +219,9 @@ class hadoopCluster:
       waitTime = self.__cfg['hod']['allocate-wait-time']
   
       while count < waitTime:
+        if hodInterrupt.isSet():
+          raise HodInterruptException()
+
         ringList = self.__svcrgyClient.getServiceInfo(
           self.__cfg['ringmaster']['userid'], self.__nodePool.getServiceId(), 
           'ringmaster', 
@@ -267,8 +248,11 @@ class hadoopCluster:
     serviceAddress = None
     serviceInfo = None
  
-    for i in range(0, 250):
+    for i in range(0, 250): 
       try:
+        if hodInterrupt.isSet():
+            raise HodInterruptException()
+
         serviceAddress = xmlrpcClient.getServiceAddr(serviceName)
         if serviceAddress:
           if serviceAddress == 'not found':
@@ -280,6 +264,8 @@ class hadoopCluster:
           else:
             serviceInfo = xmlrpcClient.getURLs(serviceName)           
             break 
+      except HodInterruptException, h:
+        raise h
       except:
         self.__log.critical("'%s': ringmaster xmlrpc error." % serviceName)
         self.__log.debug(get_exception_string())
@@ -296,6 +282,8 @@ class hadoopCluster:
                                             self.jobId, self.__hostname, 
                                             serviceName, 'grid', serviceInfo)
         
+      except HodInterruptException, h:
+        raise h
       except:
         self.__log.critical("'%s': registry xmlrpc error." % serviceName)    
         self.__log.debug(get_exception_string())
@@ -326,6 +314,8 @@ class hadoopCluster:
          link):
 
          for i in range(1,5):
+           if hodInterrupt.isSet():
+             raise HodInterruptException()
           try:
             input = urllib.urlopen(link)
             break
@@ -385,6 +375,8 @@ class hadoopCluster:
               
             self.__log.debug("Finished grabbing: %s" % link)
           except AlarmException:
+             if hodInterrupt.isSet():
+               raise HodInterruptException()
             if out: out.close()
             if input: input.close()
             
@@ -403,31 +395,12 @@ class hadoopCluster:
     if 'mapred' in clusterInfo:
       mapredAddress = clusterInfo['mapred'][7:]
       hdfsAddress = clusterInfo['hdfs'][7:]
-  
-      mapredSocket = tcpSocket(mapredAddress)
-        
-      try:
-        mapredSocket.open()
-        mapredSocket.close()
-      except tcpError:
-        status = 14
-  
-      hdfsSocket = tcpSocket(hdfsAddress)
-        
-      try:
-        hdfsSocket.open()
-        hdfsSocket.close()
-      except tcpError:
-        if status > 0:
-          status = 10
-        else:
-          status = 13
-      
+      status = get_cluster_status(hdfsAddress, mapredAddress)
       if status == 0:
         status = 12
     else:
       status = 15
-      
+
     return status
   
   def cleanup(self):
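
The deleted socket probes pin down what the new get_cluster_status helper must return: 0 when both daemons answer, 14 when only the jobtracker is down, 13 when only the namenode is down, and 10 when both are. A reconstruction faithful to the removed code:

    from hodlib.Common.tcp import tcpSocket, tcpError

    def get_cluster_status(hdfsAddress, mapredAddress):
      # probe both daemons; the codes mirror check_cluster's contract
      status = 0
      try:
        mapredSocket = tcpSocket(mapredAddress)
        mapredSocket.open()
        mapredSocket.close()
      except tcpError:
        status = 14
      try:
        hdfsSocket = tcpSocket(hdfsAddress)
        hdfsSocket.open()
        hdfsSocket.close()
      except tcpError:
        if status > 0:
          status = 10
        else:
          status = 13
      return status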
@@ -455,37 +428,67 @@ class hadoopCluster:
       self.__log.critical("Minimum nodes must be greater than 2.")
       self.__log.critical("Minimum nodes must be greater than 2.")
       status = 2
       status = 2
     else:
     else:
-      if self.__check_allocation_manager():
-        nodeSet = self.__nodePool.newNodeSet(min)
-        self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet)
-        if self.jobId:                 
-          if self.__check_job_status():
+      nodeSet = self.__nodePool.newNodeSet(min)
+      walltime = None
+      if self.__cfg['hod'].has_key('walltime'):
+        walltime = self.__cfg['hod']['walltime']
+      self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)
+      if self.jobId:
+        try:
+          jobStatus = self.__check_job_status()
+        except HodInterruptException, h:
+          self.__log.info(HOD_INTERRUPTED_MESG)
+          self.delete_job(self.jobId)
+          self.__log.info("Job %s qdelled." % self.jobId)
+          raise h
+
+        if jobStatus:
+          self.__log.info("Hod Job successfully submitted. JobId : %s." \
+                                                              % self.jobId)
+          try:
             self.ringmasterXRS = self.__get_ringmaster_client()
+            
+            self.__log.info("Ringmaster at : %s." % self.ringmasterXRS )
+            ringClient = None
             if self.ringmasterXRS:
               ringClient =  hodXRClient(self.ringmasterXRS)
-              
+                
               hdfsStatus, hdfsAddr, self.hdfsInfo = \
                 self.__init_hadoop_service('hdfs', ringClient)
-              
+                
               if hdfsStatus:
+                self.__log.info("HDFS UI on http://%s" % self.hdfsInfo)
+  
                 mapredStatus, mapredAddr, self.mapredInfo = \
                   self.__init_hadoop_service('mapred', ringClient)
-                  
+  
                 if mapredStatus:
-                  self.__log.info("HDFS UI on http://%s" % self.hdfsInfo)
                   self.__log.info("Mapred UI on http://%s" % self.mapredInfo)
- 
+  
+                  if self.__cfg['hod'].has_key('update-worker-info') \
+                    and self.__cfg['hod']['update-worker-info']:
+                    workerInfoMap = {}
+                    workerInfoMap['HDFS UI'] = 'http://%s' % self.hdfsInfo
+                    workerInfoMap['Mapred UI'] = 'http://%s' % self.mapredInfo
+                    ret = self.__nodePool.updateWorkerInfo(workerInfoMap, self.jobId)
+                    if ret != 0:
+                      self.__log.warn('Could not update HDFS and Mapred information. ' \
+                                      'User Portal may not show relevant information. ' \
+                                      'Error code=%s' % ret)
+  
+                  self.__cfg.replace_escape_seqs()
+                    
                   # Go generate the client side hadoop-site.xml now
                   # adding final-params as well, just so that conf on 
                   # client-side and server-side are (almost) the same
                   clientParams = None
                   serverParams = {}
                   finalServerParams = {}
-
+  
                   # client-params
                   if self.__cfg['hod'].has_key('client-params'):
                     clientParams = self.__cfg['hod']['client-params']
-
+  
                   # server-params
                   if self.__cfg['gridservice-mapred'].has_key('server-params'):
                     serverParams.update(\
@@ -494,8 +497,8 @@ class hadoopCluster:
                     # note that if there are params in both mapred and hdfs
                     # sections, the ones in hdfs overwrite the ones in mapred
                     serverParams.update(\
-                        self.__cfg['gridservice-mapred']['server-params'])
-                  
+                        self.__cfg['gridservice-hdfs']['server-params'])
+                    
                   # final-server-params
                   if self.__cfg['gridservice-mapred'].has_key(\
                                                     'final-server-params'):
@@ -505,9 +508,14 @@ class hadoopCluster:
                                                     'final-server-params'):
                     finalServerParams.update(\
                         self.__cfg['gridservice-hdfs']['final-server-params'])
-
+  
                   clusterFactor = self.__cfg['hod']['cluster-factor']
-                  self.__hadoopCfg.gen_site_conf(clusterDir, min,
+                  tempDir = self.__cfg['hod']['temp-dir']
+                  if not os.path.exists(tempDir):
+                    os.makedirs(tempDir)
+                  tempDir = os.path.join( tempDir, self.__cfg['hod']['userid']\
+                                  + "." + self.jobId )
+                  self.__hadoopCfg.gen_site_conf(clusterDir, tempDir, min,\
                             hdfsAddr, mapredAddr, clientParams,\
                             serverParams, finalServerParams,\
                             clusterFactor)
@@ -520,25 +528,52 @@ class hadoopCluster:
               status = 6
             if status != 0:
               self.__log.info("Cleaning up job id %s, as cluster could not be allocated." % self.jobId)
+              if ringClient is None:
+                self.delete_job(self.jobId)
+              else:
+                self.__log.debug("Calling rm.stop()")
+                ringClient.stopRM()
+                self.__log.debug("Returning from rm.stop()")
+          except HodInterruptException, h:
+            self.__log.info(HOD_INTERRUPTED_MESG)
+            if self.ringmasterXRS:
+              if ringClient is None:
+                ringClient =  hodXRClient(self.ringmasterXRS)
+              self.__log.debug("Calling rm.stop()")
+              ringClient.stopRM()
+              self.__log.debug("Returning from rm.stop()")
+              self.__log.info("Job Shutdown by informing ringmaster.")
+            else:
               self.delete_job(self.jobId)
-          else:
-            self.__log.critical("No job found, ringmaster failed to run.")
-            status = 5 
- 
-        elif self.jobId == False:
-          if exitCode == 188:
-            self.__log.critical("Request execeeded maximum resource allocation.")
-          else:
-            self.__log.critical("Insufficient resources available.")
-          status = 4
-        else:    
-          self.__log.critical("Scheduler failure, allocation failed.\n\n")        
-          status = 4
-      else:
-        status = 9
+              self.__log.info("Job %s qdelled directly." % self.jobId)
+            raise h
+        else:
+          self.__log.critical("No job found, ringmaster failed to run.")
+          status = 5 
+
+      elif self.jobId == False:
+        if exitCode == 188:
+          self.__log.critical("Request execeeded maximum resource allocation.")
+        else:
+          self.__log.critical("Insufficient resources available.")
+        status = 4
+      else:    
+        self.__log.critical("Scheduler failure, allocation failed.\n\n")        
+        status = 4
     
     return status
 
+  def __isRingMasterAlive(self, rmAddr):
+    ret = True
+    rmSocket = tcpSocket(rmAddr)
+    try:
+      rmSocket.open()
+      rmSocket.close()
+    except tcpError:
+      ret = False
+
+    return ret
+
   def deallocate(self, clusterDir, clusterInfo):
     status = 0 
     
@@ -546,6 +581,7 @@ class hadoopCluster:
                                          id=clusterInfo['jobid'])
     self.mapredInfo = clusterInfo['mapred']
     self.hdfsInfo = clusterInfo['hdfs']
+
     try:
       if self.__cfg['hod'].has_key('hadoop-ui-log-dir'):
         clusterStatus = self.check_cluster(clusterInfo)
@@ -554,9 +590,35 @@ class hadoopCluster:
           self.__collect_jobtracker_ui(self.__cfg['hod']['hadoop-ui-log-dir'])
       else:
         self.__log.debug('hadoop-ui-log-dir not specified. Skipping Hadoop UI log collection.')
+    except HodInterruptException, h:
+      # got an interrupt. just pass and proceed to qdel
+      pass 
     except:
       self.__log.info("Exception in collecting Job tracker logs. Ignoring.")
-    status = self.__nodePool.finalize()
+    
+    rmAddr = None
+    if clusterInfo.has_key('ring'):
+      # format is http://host:port/ We need host:port
+      rmAddr = clusterInfo['ring'][7:]
+      if rmAddr.endswith('/'):
+        rmAddr = rmAddr[:-1]
+
+    if (rmAddr is None) or (not self.__isRingMasterAlive(rmAddr)):
+      # Cluster is already dead, don't try to contact ringmaster.
+      self.__nodePool.finalize()
+      status = 10 # As cluster is dead, we just set the status to 'cluster dead'.
+    else:
+      xrsAddr = clusterInfo['ring']
+      rmClient = hodXRClient(xrsAddr)
+      self.__log.debug('calling rm.stop')
+      rmClient.stopRM()
+      self.__log.debug('completed rm.stop')
+
+    # cleanup hod temp dirs
+    tempDir = os.path.join( self.__cfg['hod']['temp-dir'], \
+                    self.__cfg['hod']['userid'] + "." + clusterInfo['jobid'] )
+    if os.path.exists(tempDir):
+      shutil.rmtree(tempDir)
    
     return status
   

+ 85 - 17
src/contrib/hod/hodlib/Hod/hod.py

@@ -15,7 +15,7 @@
 #limitations under the License.
 # -*- python -*-
 
-import sys, os, getpass, pprint, re, cPickle, random, shutil
+import sys, os, getpass, pprint, re, cPickle, random, shutil, time
 
 import hodlib.Common.logger
 
@@ -23,6 +23,9 @@ from hodlib.ServiceRegistry.serviceRegistry import svcrgy
 from hodlib.Common.xmlrpc import hodXRClient
 from hodlib.Common.util import to_http_url, get_exception_string
 from hodlib.Common.util import get_exception_error_string
+from hodlib.Common.util import hodInterrupt, HodInterruptException
+from hodlib.Common.util import HOD_INTERRUPTED_CODE
+
 from hodlib.Common.nodepoolutil import NodePoolUtil
 from hodlib.Hod.hadoop import hadoopCluster, hadoopScript
 
@@ -115,6 +118,9 @@ class hodRunner:
                                    level=self.__cfg['hod']['debug'], 
                                    addToLoggerNames=(self.__user ,))
 
+  def get_logger(self):
+    return self.__log
+
   def __setup_cluster_logger(self, directory):
     self.__baseLogger.add_file(logDirectory=directory, level=4, 
                                addToLoggerNames=(self.__user ,))
@@ -124,6 +130,8 @@ class hodRunner:
 
   def __norm_cluster_dir(self, directory):
     directory = os.path.expanduser(directory)
+    if not os.path.isabs(directory):
+      directory = os.path.join(self.__cfg['hod']['original-dir'], directory)
     directory = os.path.abspath(directory)
     
     return directory
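
With the added isabs() check, a relative cluster path is resolved against hod.original-dir (the directory hod was invoked from) instead of whatever the process's current working directory happens to be. The behaviour, extracted into a standalone sketch:

    import os

    def norm_cluster_dir(original_dir, directory):
      # mirrors __norm_cluster_dir after this patch
      directory = os.path.expanduser(directory)
      if not os.path.isabs(directory):
        directory = os.path.join(original_dir, directory)
      return os.path.abspath(directory)

    # e.g. norm_cluster_dir('/home/alice', 'clusters/c1')
    #   -> '/home/alice/clusters/c1'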
@@ -202,7 +210,18 @@ class hodRunner:
             self.__opCode = self.__cluster.check_cluster(clusterInfo)
             if self.__opCode == 0 or self.__opCode == 15:
               self.__setup_service_registry()   
-              allocateStatus = self.__cluster.allocate(clusterDir, min, max)    
+              if hodInterrupt.isSet(): 
+                self.__cleanup()
+                raise HodInterruptException()
+              self.__log.info("Service Registry Started.")
+              try:
+                allocateStatus = self.__cluster.allocate(clusterDir, min, max)    
+              except HodInterruptException, h:
+                self.__cleanup()
+                raise h
+              # Allocation has gone through.
+              # Don't care about interrupts any more
+
               if allocateStatus == 0:
                 self.__set_cluster_state_info(os.environ, 
                                               self.__cluster.hdfsInfo, 
@@ -213,6 +232,8 @@ class hodRunner:
                 self.__setup_cluster_state(clusterDir)
                 self.__clusterState.write(self.__cluster.jobId, 
                                           self.__clusterStateInfo)
+                #  Do we need to check for interrupts here ??
+
                 self.__set_user_state_info( 
                   { clusterDir : self.__cluster.jobId, } )
               self.__opCode = allocateStatus
@@ -239,7 +260,15 @@ class hodRunner:
       self.__log.critical("%s operation requires two arguments. "  % operation
       self.__log.critical("%s operation requires two arguments. "  % operation
                         + "A cluster path and n nodes, or min-max nodes.")
                         + "A cluster path and n nodes, or min-max nodes.")
       self.__opCode = 3
       self.__opCode = 3
-  
+ 
+  def _is_cluster_allocated(self, clusterDir):
+    if os.path.isdir(clusterDir):
+      self.__setup_cluster_state(clusterDir)
+      clusterInfo = self.__clusterState.read()
+      if clusterInfo != {}:
+        return True
+    return False
+
   def _op_deallocate(self, args):
   def _op_deallocate(self, args):
     operation = "deallocate"
     operation = "deallocate"
     argLength = len(args)
     argLength = len(args)
@@ -293,25 +322,19 @@ class hodRunner:
         clusterStatus = self.__cluster.check_cluster(clusterInfo)
         if clusterStatus == 12:
           self.__log.info(clusterDir)
-          keys = clusterInfo.keys()
-          keys.sort()
-          for key in keys:
-            if key != 'env':
-              self.__log.info("%s\t%s" % (key, clusterInfo[key]))  
-            
-          if self.__cfg['hod']['debug'] == 4:
-            for var in clusterInfo['env'].keys():
-              self.__log.debug("%s = %s" % (var, clusterInfo['env'][var]))
+          self.__print_cluster_info(clusterInfo)
         elif clusterStatus == 10:
           self.__log.critical("%s cluster is dead" % clusterDir)
         elif clusterStatus == 13:
           self.__log.warn("%s cluster hdfs is dead" % clusterDir)
         elif clusterStatus == 14:
           self.__log.warn("%s cluster mapred is dead" % clusterDir)
-        
+
         if clusterStatus != 12:
           if clusterStatus == 15:
             self.__log.critical("Cluster %s not allocated." % clusterDir)
+          else:
+            self.__print_cluster_info(clusterInfo)
             
           self.__opCode = clusterStatus
       else:
@@ -321,7 +344,19 @@ class hodRunner:
       self.__log.critical("%s operation requires one argument. "  % operation
       self.__log.critical("%s operation requires one argument. "  % operation
                         + "A cluster path.")
                         + "A cluster path.")
       self.__opCode = 3      
       self.__opCode = 3      
-  
+ 
+  def __print_cluster_info(self, clusterInfo):
+    keys = clusterInfo.keys()
+    keys.sort()
+    for key in keys:
+      if key != 'env':
+        self.__log.info("%s\t%s" % (key, clusterInfo[key]))  
+            
+    if self.__cfg['hod']['debug'] == 4:
+      for var in clusterInfo['env'].keys():
+        self.__log.debug("%s = %s" % (var, clusterInfo['env'][var]))
+
+ 
   def _op_help(self, args):  
   def _op_help(self, args):  
     print "hod operations:\n"
     print "hod operations:\n"
     print " allocate <directory> <nodes> - Allocates a cluster of n nodes using the specified cluster"
     print " allocate <directory> <nodes> - Allocates a cluster of n nodes using the specified cluster"
@@ -342,6 +377,10 @@ class hodRunner:
       opList = self.__check_operation(operation)
       if self.__opCode == 0:
         getattr(self, "_op_%s" % opList[0])(opList)
+    except HodInterruptException, h:
+      self.__log.critical("op: %s failed because of a process interrupt." \
+                                                                % operation)
+      self.__opCode = HOD_INTERRUPTED_CODE
     except:
       self.__log.critical("op: %s failed: %s" % (operation,
                           get_exception_error_string()))
@@ -356,16 +395,41 @@ class hodRunner:
   def script(self):
     script = self.__cfg['hod']['script']
     nodes = self.__cfg['hod']['min-nodes']
+    isExecutable = os.access(script, os.X_OK)
+    if not isExecutable:
+      self.__log.critical('Script %s is not an executable.' % script)
+      return 1
+
     clusterDir = "/tmp/%s.%s" % (self.__cfg['hod']['userid'], 
                                  random.randint(0, 20000))
     os.mkdir(clusterDir)
+    ret = 0
     try:
       self._op_allocate(('allocate', clusterDir, str(nodes)))
-      scriptRunner = hadoopScript(clusterDir, 
+      if self.__opCode == 0:
+        if self.__cfg['hod'].has_key('script-wait-time'):
+          time.sleep(self.__cfg['hod']['script-wait-time'])
+          self.__log.debug('Slept for %d seconds. Now going to run the script' % self.__cfg['hod']['script-wait-time'])
+        if hodInterrupt.isSet():
+          self.__log.debug('Interrupt set - not executing script')
+        else:
+          scriptRunner = hadoopScript(clusterDir, 
                                   self.__cfg['hod']['original-dir'])
-      self.__opCode = scriptRunner.run(script)
-      self._op_deallocate(('deallocate', clusterDir))
+          self.__opCode = scriptRunner.run(script)
+          ret = self.__opCode
+          self.__log.debug("Exit code from running the script: %d" % self.__opCode)
+      else:
+        self.__log.critical("Error %d in allocating the cluster. Cannot run the script." % self.__opCode)
+
+      if hodInterrupt.isSet():
+        # Got interrupt while executing script. Unsetting it for deallocating
+        hodInterrupt.setFlag(False)
+      if self._is_cluster_allocated(clusterDir):
+        self._op_deallocate(('deallocate', clusterDir))
       shutil.rmtree(clusterDir, True)
+    except HodInterruptException, h:
+      self.__log.critical("Script failed because of an process interrupt.")
+      self.__opCode = HOD_INTERRUPTED_CODE
     except:
       self.__log.critical("script: %s failed: %s" % (script,
                           get_exception_error_string()))
@@ -373,4 +437,8 @@ class hodRunner:
     
     self.__cleanup()      
     
+    # We want to give importance to a failed script's exit code.
+    if ret != 0:
+      self.__opCode = ret
+
     return self.__opCode
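
One subtlety in the ret bookkeeping above: _op_deallocate and __cleanup can overwrite self.__opCode, so without it a failing script followed by a clean deallocation would report success. Saving the script's exit code in ret and restoring it at the end lets the script's failure status win.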

+ 4 - 0
src/contrib/hod/hodlib/Hod/nodePool.py

@@ -108,6 +108,10 @@ class NodePool:
     """Delete a job, given it's id"""
     """Delete a job, given it's id"""
     raise NotImplementedError
     raise NotImplementedError
 
 
+  def updateWorkerInfo(self, workerInfoMap):
+    """Update information about the workers started by this NodePool."""
+    raise NotImplementedError
+
   def getNextNodeSetId(self):
   def getNextNodeSetId(self):
     id = self.nextNodeSetId
     id = self.nextNodeSetId
     self.nextNodeSetId += 1
     self.nextNodeSetId += 1

+ 93 - 147
src/contrib/hod/hodlib/HodRing/hodRing.py

@@ -19,13 +19,14 @@
 """
 """
 # -*- python -*-
 # -*- python -*-
 import os, sys, time, shutil, getpass, xml.dom.minidom, xml.dom.pulldom
 import os, sys, time, shutil, getpass, xml.dom.minidom, xml.dom.pulldom
-import socket, sets, urllib, csv, signal, pprint, random, re
+import socket, sets, urllib, csv, signal, pprint, random, re, httplib
 
 
 from xml.dom import getDOMImplementation
 from xml.dom import getDOMImplementation
 from pprint import pformat
 from pprint import pformat
 from optparse import OptionParser
 from optparse import OptionParser
 from urlparse import urlparse
 from urlparse import urlparse
-from hodlib.Common.util import local_fqdn
+from hodlib.Common.util import local_fqdn, parseEquals
+from hodlib.Common.tcp import tcpSocket, tcpError 
 
 
 binfile = sys.path[0]
 binfile = sys.path[0]
 libdir = os.path.dirname(binfile)
 libdir = os.path.dirname(binfile)
@@ -53,6 +54,7 @@ class CommandDesc:
     self.log.debug("In command desc")
     self.log.debug("In command desc")
     self.log.debug("Done in command desc")
     self.log.debug("Done in command desc")
     dict.setdefault('argv', [])
     dict.setdefault('argv', [])
+    dict.setdefault('version', None)
     dict.setdefault('envs', {})
     dict.setdefault('envs', {})
     dict.setdefault('java-opts', [])
     dict.setdefault('java-opts', [])
     dict.setdefault('workdirs', [])
     dict.setdefault('workdirs', [])
@@ -83,6 +85,9 @@ class CommandDesc:
   def getArgv(self):
     return self.dict['argv']
 
+  def getVersion(self):
+    return self.dict['version']
+
   def getEnvs(self):
     return self.dict['envs']
 
@@ -243,9 +248,13 @@ class HadoopCommand:
     topElement = doc.documentElement
     topElement.appendChild(comment)
     
-    attr = self.desc.getfinalAttrs()
-    self.createXML(doc, attr, topElement, True)
-    attr = self.desc.getAttrs()
+    finalAttr = self.desc.getfinalAttrs()
+    self.createXML(doc, finalAttr, topElement, True)
+    attr = {}
+    attr1 = self.desc.getAttrs()
+    for k,v in attr1.iteritems():
+      if not finalAttr.has_key(k):
+        attr[k] = v
     self.createXML(doc, attr, topElement, False)
               
     
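
The effect of the finalAttr filtering above: when the same key appears in both the final and the non-final attribute sets, only the final variant is written into the generated hadoop-site.xml, instead of emitting the key twice and leaving the outcome to Hadoop's parse order.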
@@ -306,7 +315,7 @@ class HadoopCommand:
     fenvs = os.environ
     
     for k, v in envs.iteritems():
-      fenvs[k] = v[0]
+      fenvs[k] = v
     
     self.log.debug(javaOpts)
     fenvs['HADOOP_OPTS'] = ''
@@ -440,6 +449,15 @@ class HodRing(hodBaseService):
     self.log.debug("tarball name : %s hadoop package name : %s" %(name,hadoopPackage))
     self.log.debug("tarball name : %s hadoop package name : %s" %(name,hadoopPackage))
     return hadoopPackage
     return hadoopPackage
 
 
+  def getRunningValues(self):
+    return self.__running.values()
+
+  def getTempDir(self):
+    return self.__tempDir
+
+  def getHadoopLogDirs(self):
+    return self.__hadoopLogDirs
+ 
   def __download_package(self, ringClient):
   def __download_package(self, ringClient):
     self.log.debug("Found download address: %s" % 
     self.log.debug("Found download address: %s" % 
                    self._cfg['download-addr'])
                    self._cfg['download-addr'])
@@ -523,6 +541,75 @@ class HodRing(hodBaseService):
         continue
       self.__running[id-1] = cmd
 
+      # ok.. now command is running. If this HodRing got jobtracker, 
+      # Check if it is ready for accepting jobs, and then only return
+      self.__check_jobtracker(desc, id-1)
+      
+  def __check_jobtracker(self, desc, id):
+    # Check jobtracker status. Return properly if it is ready to accept jobs.
+    # Currently checks for Jetty to come up, the last thing that can be checked
+    # before JT completes initialisation. To be perfectly reliable, we need 
+    # hadoop support
+    name = desc.getName()
+    if name == 'jobtracker':
+      # Yes I am the Jobtracker
+      self.log.debug("Waiting for jobtracker to initialise")
+      version = desc.getVersion()
+      self.log.debug("jobtracker version : %s" % version)
+      attrs = self.getRunningValues()[id].getFilledInKeyValues()
+      attrs = parseEquals(attrs)
+      jobTrackerAddr = attrs['mapred.job.tracker']
+      self.log.debug("jobtracker rpc server : %s" % jobTrackerAddr)
+      if version < 16:
+        jettyAddr = jobTrackerAddr.split(':')[0] + ':' + \
+                              attrs['mapred.job.tracker.info.port']
+      else:
+        jettyAddr = attrs['mapred.job.tracker.http.bindAddress']
+      self.log.debug("Jobtracker jetty : %s" % jettyAddr)
+
+      # Check for Jetty to come up
+      # For this do a http head, and then look at the status
+      defaultTimeout = socket.getdefaulttimeout()
+      # socket timeout isn't exposed at httplib level. Setting explicitly.
+      socket.setdefaulttimeout(1)
+      sleepTime = 0.5
+      jettyStatus = False
+      jettyStatusmsg = ""
+      while sleepTime <= 32:
+        try:
+          jettyConn = httplib.HTTPConnection(jettyAddr)
+          jettyConn.request("HEAD", "/jobtracker.jsp")
+          # httplib inherently retries the following till socket timeout
+          resp = jettyConn.getresponse()
+          if resp.status != 200:
+            # Some problem?
+            jettyStatus = False
+            jettyStatusmsg = "Jetty gave a non-200 response to a HTTP-HEAD" +\
+                             " request. HTTP Status (Code, Msg): (%s, %s)" % \
+                             ( resp.status, resp.reason )
+            break
+          else:
+            self.log.info("Jetty returned a 200 status (%s)" % resp.reason)
+            self.log.info("JobTracker successfully initialised")
+            return
+        except socket.error:
+          self.log.debug("Jetty gave a socket error. Sleeping for %s" \
+                                                                  % sleepTime)
+          time.sleep(sleepTime)
+          sleepTime = sleepTime * 2
+        except Exception, e:
+          jettyStatus = False
+          jettyStatusmsg = ("Process(possibly other than jetty) running on" + \
+                  " port assigned to jetty is returning invalid http response")
+          break
+      socket.setdefaulttimeout(defaultTimeout)
+      if not jettyStatus:
+        self.log.critical("Jobtracker failed to initialise.")
+        if jettyStatusmsg:
+          self.log.critical( "Reason: %s" % jettyStatusmsg )
+        else: self.log.critical( "Reason: Jetty failed to give response")
+        raise Exception("JobTracker failed to initialise")
+
   def stop(self):
     self.log.debug("Entered hodring stop.")
     if self._http: 
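
Timing of the Jetty probe loop above: sleepTime doubles from 0.5 up to 32, so a jobtracker that never comes up is attempted 7 times with 0.5+1+2+4+8+16+32 = 63.5 seconds of sleep in total, each attempt additionally bounded by the 1-second socket timeout set just before the loop.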
@@ -532,153 +619,12 @@ class HodRing(hodBaseService):
     self.log.debug("call hodsvcrgy stop...")
     self.log.debug("call hodsvcrgy stop...")
     hodBaseService.stop(self)
     hodBaseService.stop(self)
     
     
-    self.clean_up()
-    
-  def clean_up(self):
-    os.chdir(originalcwd)
-    if not mswindows:
-      # do the UNIX double-fork magic, see Stevens' "Advanced 
-      # Programming in the UNIX Environment" for details (ISBN 0201563177)
-      try: 
-        pid = os.fork() 
-        if pid > 0:
-          # exit first parent
-          sys.exit(0) 
-      except OSError, e: 
-        self.log.error("fork #1 failed: %d (%s)" % (e.errno, e.strerror)) 
-        sys.exit(1)
-
-      # decouple from parent environment
-      os.chdir("/") 
-      os.setsid() 
-      os.umask(0) 
-
-      # do second fork
-      try: 
-        pid = os.fork() 
-        if pid > 0:
-          # exit from second parent, print eventual PID before
-          sys.exit(0) 
-      except OSError, e: 
-        self.log.error("fork #2 failed: %d (%s)" % (e.errno, e.strerror))
-        sys.exit(1) 
-
-    try:
-#      for cmd in self.__running.values():
-#        self.log.debug("killing %s..." % cmd)
-#        cmd.kill()
-  
-      list = []
-      
-      for cmd in self.__running.values():
-        self.log.debug("addding %s to cleanup list..." % cmd)
-        cmd.addCleanup(list)
-      
-      list.append(self.__tempDir)
-         
-      self.__archive_logs()   
-          
-      for dir in list:
-        if os.path.exists(dir):
-          self.log.debug('removing %s' % (dir))
-          shutil.rmtree(dir, True)
-    except:
-      self.log.error(get_exception_string())
-    sys.exit(0)
-
   def _xr_method_clusterStart(self, initialize=True):
     return self.clusterStart(initialize)
 
   def _xr_method_clusterStop(self):
     return self.clusterStop()
  
-  def __copy_archive_to_dfs(self, archiveFile):        
-    hdfsURIMatch = reHdfsURI.match(self._cfg['log-destination-uri'])
-    
-    # FIXME this is a complete and utter hack. Currently hadoop is broken
-    # and does not understand hdfs:// syntax on the command line :(
-    
-    pid = os.getpid()
-    tempConfDir = '/tmp/%s' % pid
-    os.mkdir(tempConfDir)
-    tempConfFileName = '%s/hadoop-site.xml' % tempConfDir
-    tempHadoopConfig = open(tempConfFileName, 'w')
-    print >>tempHadoopConfig, "<configuration>"
-    print >>tempHadoopConfig, "  <property>"
-    print >>tempHadoopConfig, "    <name>fs.default.name</name>"
-    print >>tempHadoopConfig, "    <value>%s</value>" % hdfsURIMatch.group(1)
-    print >>tempHadoopConfig, "    <description>No description</description>"
-    print >>tempHadoopConfig, "  </property>"
-    print >>tempHadoopConfig, "</configuration>"
-    tempHadoopConfig.close()
-    
-    # END LAME HACK
-    
-    (head, tail) = os.path.split(archiveFile)
-    destFile = os.path.join(hdfsURIMatch.group(2), self._cfg['userid'], 
-                            self._cfg['service-id'], tail)
-    
-    self.log.info("copying archive %s to DFS %s ..." % (archiveFile, destFile))
-    
-    runningHadoops = self.__running.values()
-    if (len(runningHadoops) == 0):
-      self.log.info("len(runningHadoops) == 0, No running cluster?")
-      self.log.info("Skipping __copy_archive_to_dfs")
-      return
- 
-    run = runningHadoops[0]
-    hadoopCmd = run.path
-    if self._cfg.has_key('pkgs'):
-      hadoopCmd = os.path.join(self._cfg['pkgs'], 'bin', 'hadoop')
-
-    # LAME HACK AGAIN, using config generated above :( 
-    copyCommand = "%s --config %s dfs -copyFromLocal %s %s" % (hadoopCmd, 
-      tempConfDir, archiveFile, destFile)
-    
-    self.log.debug(copyCommand)
-    
-    copyThread = simpleCommand('hadoop', copyCommand)
-    copyThread.start()
-    copyThread.wait()
-    copyThread.join()
-    self.log.debug(pprint.pformat(copyThread.output()))
-    
-    # LAME HACK AGAIN, deleting config generated above :( 
-    os.unlink(tempConfFileName)
-    os.rmdir(tempConfDir)
-    os.unlink(archiveFile)
-  
-  def __archive_logs(self):
-    status = True
-    if self._cfg.has_key("log-destination-uri"):
-      try:
-        if self.__hadoopLogDirs:
-          date = time.localtime()
-          for logDir in self.__hadoopLogDirs:
-            (head, tail) = os.path.split(logDir)
-            (head, logType) = os.path.split(head)
-            tarBallFile = "%s-%s-%04d%02d%02d%02d%02d%02d-%s.tar.gz" % (
-              logType, local_fqdn(), date[0], date[1], date[2], date[3], 
-              date[4], date[5], random.randint(0,1000))
-            
-            if self._cfg["log-destination-uri"].startswith('file://'):
-              tarBallFile = os.path.join(self._cfg["log-destination-uri"][7:], 
-                                         tarBallFile)
-            else:
-              tarBallFile = os.path.join(self._cfg['temp-dir'], tarBallFile)
-            
-            self.log.info('archiving log files to: %s' % tarBallFile)
-            status = tar(tarBallFile, logDir, ['*',])
-            self.log.info('archive %s status: %s' % (tarBallFile, status))
-            if status and \
-              self._cfg["log-destination-uri"].startswith('hdfs://'):
-              self.__copy_archive_to_dfs(tarBallFile)
-          dict = {} 
-      except:
-        self.log.error(get_exception_string())
-      
-    return status
-      
   def start(self):
     """Run and maintain hodring commands"""
     

+ 14 - 6
src/contrib/hod/hodlib/NodePools/torque.py

@@ -150,7 +150,8 @@ class TorquePool(NodePool):
         break
       
       argList.extend(process_qsub_attributes())
-      argList.extend(('-N', 'HOD'))
+
+      argList.extend(('-N', '"' + self._cfg['hod']['title'] + '"'))
       argList.extend(('-r','n'))
 
       if 'pbs-user' in self._cfg['resource_manager']:
@@ -161,9 +162,11 @@ class TorquePool(NodePool):
         queue = self._cfg['resource_manager']['queue']
         argList.extend(('-q',queue))
   
-      # accounting should recognize userid:pbs-account as being "charged"
-      argList.extend(('-A', (self._cfg['hod']['userid'] + ':' + 
-                   self._cfg['resource_manager']['pbs-account'])))
+      # In HOD 0.4, we pass in an account string only if it is mentioned.
+      # Also, we don't append userid to the account string, as HOD jobs run as the 
+      # user running them, not as 'HOD' user.
+      if self._cfg['resource_manager'].has_key('pbs-account'):
+        argList.extend(('-A', (self._cfg['resource_manager']['pbs-account'])))
     
       if 'env-vars' in self._cfg['resource_manager']:
         qsub_envs = self._cfg['resource_manager']['env-vars']
@@ -177,7 +180,7 @@ class TorquePool(NodePool):
   def __keyValToString(self, keyValList):
   def __keyValToString(self, keyValList):
     ret = ""
     ret = ""
     for key in keyValList:
     for key in keyValList:
-      ret = "%s%s=%s," % (ret, key, keyValList[key][0])
+      ret = "%s%s=%s," % (ret, key, keyValList[key])
     return ret[:-1]
     return ret[:-1]
   
   
   def newNodeSet(self, numNodes, preferred=[], isPreemptee=True, id=None):
   def newNodeSet(self, numNodes, preferred=[], isPreemptee=True, id=None):
@@ -288,5 +291,10 @@ class TorquePool(NodePool):
   def runWorkers(self, args):
   def runWorkers(self, args):
     return self.__torque.pbsdsh(args)
     return self.__torque.pbsdsh(args)
 
 
-
+  def updateWorkerInfo(self, workerInfoMap, jobId):
+    workerInfoStr = ''
+    for key in workerInfoMap.keys():
+      workerInfoStr = '%s,%s:%s' % (workerInfoStr, key, workerInfoMap[key])
+    exitCode = self.__torque.qalter("notes", workerInfoStr[1:], jobId)
+    return exitCode
 
 

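The new updateWorkerInfo method flattens the worker map into a comma-separated key:value string before handing it to qalter. A standalone sketch of that encoding, with a hypothetical worker map:

# Sketch of the notes-string encoding used by updateWorkerInfo above.
# The map contents are hypothetical.
workerInfoMap = {'ringmaster': 'host1.example.com:9001',
                 'mapred': 'host2.example.com:9002'}

workerInfoStr = ''
for key in workerInfoMap.keys():
    workerInfoStr = '%s,%s:%s' % (workerInfoStr, key, workerInfoMap[key])
# The leading comma is stripped before the string reaches qalter.
print(workerInfoStr[1:])  # e.g. ringmaster:host1.example.com:9001,mapred:host2.example.com:9002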
+ 57 - 34
src/contrib/hod/hodlib/RingMaster/idleJobTracker.py

@@ -16,7 +16,20 @@
 import os, re, time
 from hodlib.Common.threads import loop, func
 from hodlib.Common.threads import simpleCommand
-from hodlib.Common.util import get_exception_string
+from hodlib.Common.util import get_exception_string, hadoopVersion
+
+class HadoopJobStatus:
+  """This class represents the status of a single Hadoop job"""
+  
+  def __init__(self, jobId, status):
+    self.__jobId = jobId
+    self.__status = status
+
+  def getJobId(self):
+    return self.__jobId
+
+  def getStatus(self):
+    return self.__status
 
 class JobTrackerMonitor:
   """This class monitors the JobTracker of an allocated cluster
@@ -39,9 +52,11 @@ class JobTrackerMonitor:
     # The service info provider will be polled until we get the URL.
     self.__serviceInfoProvider = servInfoProvider
     self.__jobCountRegExp = re.compile("([0-9]+) jobs currently running.*")
+    self.__jobStatusRegExp = re.compile("(\S+)\s+(\d)\s+\d+\s+\S+$")
     self.__firstIdleTime = 0
+    self.__hadoop15Version = { 'major' : '0', 'minor' : '15' }
     #Assumption: we are not going to support versions older than 0.15 for Idle Job tracker.
-    if not self.__isCompatibleHadoopVersion():
+    if not self.__isCompatibleHadoopVersion(self.__hadoop15Version):
       raise Exception('Incompatible Hadoop Version: Cannot check status')
     self.__stopFlag = False
     self.__jtURLFinderThread = func(name='JTURLFinderThread', functionRef=self.getJobTrackerURL)
@@ -87,6 +102,36 @@ class JobTrackerMonitor:
     except:
       self.__log.debug('Exception while monitoring job tracker. %s' % get_exception_string())
 
+  def getJobsStatus(self):
+    """This method should return the status of all jobs that are run on the HOD allocated
+       hadoop cluster"""
+    jobStatusList = []
+    try:
+      hadoop16Version = { 'major' : '0', 'minor' : '16' }
+      if self.__isCompatibleHadoopVersion(hadoop16Version):
+        jtStatusCommand = self.__initStatusCommand(option='-list all')
+        jtStatusCommand.start()
+        jtStatusCommand.wait()
+        jtStatusCommand.join()
+        if jtStatusCommand.exit_code() == 0:
+          for line in jtStatusCommand.output():
+            jobStatus = self.__extractJobStatus(line)
+            if jobStatus is not None:
+              jobStatusList.append(jobStatus)
+    except:
+      self.__log.debug('Exception while getting job statuses. %s' % get_exception_string())
+    return jobStatusList
+
+  def __extractJobStatus(self, line):
+    """This method parses an output line from the job status command and creates
+       the JobStatus object if there is a match"""
+    jobStatus = None
+    line = line.strip()
+    jsMatch = self.__jobStatusRegExp.match(line)
+    if jsMatch:
+      jobStatus = HadoopJobStatus(jsMatch.group(1), int(jsMatch.group(2)))
+    return jobStatus
+
   def __isIdle(self):
     """This method checks if the JobTracker is idle beyond a certain limit."""
     if self.__getJobCount() == 0:
@@ -121,47 +166,25 @@ class JobTrackerMonitor:
           jobs = int(match.group(1))
     return jobs
 
-  def __findHadoopVersion(self):
-    """This method determines the version of hadoop being used by executing the 
-       hadoop version command"""
-    verMap = { 'major' : None, 'minor' : None }
-    hadoopPath = os.path.join(self.__hadoopDir, 'bin', 'hadoop')
-    cmd = "%s version" % hadoopPath
-    self.__log.debug('Executing command %s to find hadoop version' % cmd)
-    env = os.environ
-    env['JAVA_HOME'] = self.__javaHome
-    hadoopVerCmd = simpleCommand('HadoopVersion', cmd, env)
-    hadoopVerCmd.start()
-    hadoopVerCmd.wait()
-    hadoopVerCmd.join()
-    if hadoopVerCmd.exit_code() == 0:
-      verLine = hadoopVerCmd.output()[0]
-      self.__log.debug('Version from hadoop command: %s' % verLine)
-      hadoopVerRegExp = re.compile("Hadoop ([0-9]+)\.([0-9]+).*")
-      verMatch = hadoopVerRegExp.match(verLine)
-      if verMatch != None:
-        verMap['major'] = verMatch.group(1)
-        verMap['minor'] = verMatch.group(2)
-
-    return verMap
-
-  def __isCompatibleHadoopVersion(self):
+  def __isCompatibleHadoopVersion(self, expectedVersion):
     """This method determines whether the version of hadoop being used is one that 
-       provides the hadoop job -list command or not"""
-    ver = self.__findHadoopVersion()
+       is higher than the expectedVersion.
+       This can be used for checking if a particular feature is available or not"""
+    ver = hadoopVersion(self.__hadoopDir, self.__javaHome, self.__log)
     ret = False
   
-    if (ver['major']!=None) and (int(ver['major']) >= 0) \
-      and (ver['minor']!=None) and (int(ver['minor']) >= 15):
+    if (ver['major']!=None) and (int(ver['major']) >= int(expectedVersion['major'])) \
+      and (ver['minor']!=None) and (int(ver['minor']) >= int(expectedVersion['minor'])):
       ret = True
-
     return ret
 
-  def __initStatusCommand(self):
+  def __initStatusCommand(self, option="-list"):
     """This method initializes the command to run to check the JT status"""
     cmd = None
     hadoopPath = os.path.join(self.__hadoopDir, 'bin', 'hadoop')
-    cmdStr = "%s job -jt %s -list" % (hadoopPath, self.__jobTrackerURL)
+    cmdStr = "%s job -jt %s" % (hadoopPath, self.__jobTrackerURL)
+    cmdStr = "%s %s" % (cmdStr, option)
+    self.__log.debug('cmd str %s' % cmdStr)
     env = os.environ
     env['JAVA_HOME'] = self.__javaHome
     cmd = simpleCommand('HadoopStatus', cmdStr, env)

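The new __jobStatusRegExp is written against the four-column output of hadoop job -list (job id, integer state, start time, user). A quick sketch of the match, using a made-up output line whose field layout follows what the regular expression expects:

# Sketch of how __extractJobStatus matches one 'hadoop job -list all' line.
# The sample line is made up for illustration.
import re

jobStatusRegExp = re.compile(r"(\S+)\s+(\d)\s+\d+\s+\S+$")
line = "job_200801281030_0001 2 1201512345678 hadoopuser".strip()
m = jobStatusRegExp.match(line)
if m:
    jobId, status = m.group(1), int(m.group(2))
    print((jobId, status))   # ('job_200801281030_0001', 2)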
+ 90 - 13
src/contrib/hod/hodlib/RingMaster/ringMaster.py

@@ -28,7 +28,7 @@ libdir = os.path.dirname(binfile)
 sys.path.append(libdir)
 
 import hodlib.Common.logger
-from hodlib.RingMaster.idleJobTracker import JobTrackerMonitor
+from hodlib.RingMaster.idleJobTracker import JobTrackerMonitor, HadoopJobStatus
 
 from hodlib.Common.threads import func 
 
@@ -484,7 +484,20 @@ class _LogMasterSources:
     
     return addr
 
-  
+  def stopRM(self):
+    """An XMLRPC call which will spawn a thread to stop the Ringmaster program."""
+    # We spawn a thread here because we want the XMLRPC call to return. Calling
+    # stop directly from here will also stop the XMLRPC server.
+    try:
+      self.log.debug("inside xml-rpc call to stop ringmaster")
+      rmStopperThread = func('RMStopper', self.rm.stop)
+      rmStopperThread.start()
+      self.log.debug("returning from xml-rpc call to stop ringmaster")
+      return True
+    except:
+      self.log.debug("Exception in stop: %s" % get_exception_string())
+      return False
+
 class RingMaster:
   def __init__(self, cfg, log, **kwds):
     """starts nodepool and services"""
@@ -499,6 +512,8 @@ class RingMaster:
     self.__jtMonitor = None
     self.__idlenessDetected = False
     self.__stopInProgress = False
+    self.__isStopped = False # to let main exit
+    self.__exitCode = 0 # exit code with which the ringmaster main method should return
 
     self.__initialize_signal_handlers()
     
@@ -544,23 +559,33 @@ class RingMaster:
 
       hdfsDesc = sdl['hdfs']
       hdfs = None
+ 
+      # Determine hadoop Version
+      hadoopVers = hadoopVersion(self.__getHadoopDir(), \
+                                self.cfg['hodring']['java-home'], self.log)
+      
       if hdfsDesc.isExternal():
-        hdfs = HdfsExternal(hdfsDesc, workDirs)
+        hdfs = HdfsExternal(hdfsDesc, workDirs, version=int(hadoopVers['minor']))
+        hdfs.setMasterParams( self.cfg['gridservice-hdfs'] )
       else:
-        hdfs = Hdfs(hdfsDesc, workDirs, 0)
+        hdfs = Hdfs(hdfsDesc, workDirs, 0, version=int(hadoopVers['minor']))
 
       self.serviceDict[hdfs.getName()] = hdfs
       
       mrDesc = sdl['mapred']
       mr = None
       if mrDesc.isExternal():
-        mr = MapReduceExternal(mrDesc, workDirs)
+        mr = MapReduceExternal(mrDesc, workDirs, version=int(hadoopVers['minor']))
+        mr.setMasterParams( self.cfg['gridservice-mapred'] )
       else:
-        mr = MapReduce(mrDesc, workDirs,1)
+        mr = MapReduce(mrDesc, workDirs, 1, version=int(hadoopVers['minor']))
 
       self.serviceDict[mr.getName()] = mr
     except:
-      self.log.debug(get_exception_string)
+      self.log.critical("Exception in creating Hdfs and Map/Reduce descriptor objects: %s." \
+                        % get_exception_error_string())
+      self.log.debug(get_exception_string())
+      raise
 
     # should not be starting these in a constructor
     ringMasterServer.startService(self.serviceDict, cfg, self.np, log, self)
@@ -860,23 +885,74 @@ class RingMaster:
     
     self._finalize()
 
+  def __findExitCode(self):
+    """Determine the exit code based on the status of the cluster or the jobs run on it"""
+    xmlrpcServer = ringMasterServer.instance.logMasterSources
+    if xmlrpcServer.getServiceAddr('hdfs') == 'not found':
+      self.__exitCode = 7
+    elif xmlrpcServer.getServiceAddr('mapred') == 'not found':
+      self.__exitCode = 8
+    else:
+      clusterStatus = get_cluster_status(xmlrpcServer.getServiceAddr('hdfs'),
+                                          xmlrpcServer.getServiceAddr('mapred'))
+      if clusterStatus != 0:
+        self.__exitCode = clusterStatus
+      else:
+        self.__exitCode = self.__findHadoopJobsExitCode()
+    self.log.debug('exit code %s' % self.__exitCode)
+
+  def __findHadoopJobsExitCode(self):
+    """Determine the consolidated exit code of hadoop jobs run on this cluster, provided
+       this information is available. Return 0 otherwise"""
+    ret = 0
+    failureStatus = 3
+    failureCount = 0
+    if self.__jtMonitor:
+      jobStatusList = self.__jtMonitor.getJobsStatus()
+      try:
+        if len(jobStatusList) > 0:
+          for jobStatus in jobStatusList:
+            self.log.debug('job status for %s: %s' % (jobStatus.getJobId(), 
+                                                      jobStatus.getStatus()))
+            if jobStatus.getStatus() == failureStatus:
+              failureCount = failureCount+1
+        if failureCount > 0:
+          if failureCount == len(jobStatusList): # all jobs failed
+            ret = 16
+          else:
+            ret = 17
+      except:
+        self.log.debug('exception in finding hadoop jobs exit code %s' % get_exception_string())
+    return ret
+
   def stop(self):
     self.log.debug("RingMaster stop method invoked.")
-    if self.__stopInProgress:
+    if self.__stopInProgress or self.__isStopped:
       return
     self.__stopInProgress = True
-    if self.__jtMonitor is not None:
-      self.__jtMonitor.stop()
     if ringMasterServer.instance is not None:
+      self.log.debug('finding exit code')
+      self.__findExitCode()
       self.log.debug('stopping ringmaster instance')
       ringMasterServer.stopService()
+    else:
+      self.__exitCode = 6
+    if self.__jtMonitor is not None:
+      self.__jtMonitor.stop()
     if self.httpServer:
       self.httpServer.stop()
       
     self.__clean_up()
+    self.__isStopped = True
+
+  def shouldStop(self):
+    """Indicates whether the main loop should exit, either because the cluster was
+    detected to be idle or because a stop signal was received"""
+    return self.__idlenessDetected or self.__isStopped
 
-  def isClusterIdle(self):
-    return self.__idlenessDetected
+  def getExitCode(self):
+    """return the exit code of the program"""
+    return self.__exitCode
 
 def main(cfg,log):
   try:
@@ -885,10 +961,11 @@ def main(cfg,log):
     cfg = dGen.initializeDesc()
     rm = RingMaster(cfg, log)
     rm.start()
-    while not rm.isClusterIdle():
+    while not rm.shouldStop():
       time.sleep(1)
     rm.stop()
     log.debug('returning from main')
+    return rm.getExitCode()
   except Exception, e:
     if log:
       log.critical(get_exception_string())

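The ringmaster exit codes introduced in __findExitCode and stop above can be summarized in one place. The numeric values come from the patch itself; the descriptions are a reading of the surrounding logic, not official documentation:

# Ringmaster exit codes as introduced by this patch; comments are a
# reading of the surrounding logic, not official documentation.
RM_EXIT_CODES = {
    0:  'success, or job status information unavailable',
    6:  'ringmaster server instance never came up',
    7:  'hdfs service address not found',
    8:  'mapred service address not found',
    16: 'all hadoop jobs run on the cluster failed',
    17: 'some (but not all) hadoop jobs failed',
}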
+ 32 - 4
src/contrib/hod/hodlib/Schedulers/torque.py

@@ -28,6 +28,7 @@ class torqueInterface:
     self.__qstat = os.path.join(torqueDir, 'bin', 'qstat')
     self.__pbsNodes = os.path.join(torqueDir, 'bin', 'pbsnodes')
     self.__pbsdsh = os.path.join(torqueDir, 'bin', 'pbsdsh')
+    self.__qalter = os.path.join(torqueDir, 'bin', 'qalter')
     self.__env = environment
     
     self.__log = log
@@ -48,11 +49,23 @@ class torqueInterface:
     while qsubProcess.stdin == None:
       time.sleep(.2)
 
-    for line in stdinList:
-      self.__log.debug("qsub stdin: %s" % line)
-      print >>qsubProcess.stdin, line
+    try:
+      for line in stdinList:
+        self.__log.debug("qsub stdin: %s" % line)
+        print >>qsubProcess.stdin, line
+      qsubProcess.stdin.close()
+    except IOError, i:
+      # If torque's qsub is given invalid params, it fails & returns immediately.
+      # Check for such errors here, after waiting for command execution to finish.
+      qsubProcess.wait()
+      qsubProcess.join()
+      output = qsubProcess.output()
+      if output!=[]:
+        self.__log.critical("qsub Failure : %s " % output[0].strip())
+        self.__log.critical("qsub Command : %s" % qsubCommand)
+      return None, qsubProcess.exit_code()
 
-    qsubProcess.stdin.close()
     qsubProcess.wait()
     qsubProcess.join()
     
@@ -145,3 +158,18 @@ class torqueInterface:
     if not status: status = 0
       
     return status  
+
+  def qalter(self, fieldName, fieldValue, jobId):
+    """Update the job field fieldName with fieldValue.
+       The field must be modifiable after the job is submitted."""
+
+    # E.g. to alter the comment: qalter -W notes='value' jobId
+    qalterCmd = '%s -W %s=\"%s\" %s' % (self.__qalter, fieldName, fieldValue, jobId) 
+    self.__log.debug("qalter command: %s" % qalterCmd)
+    qalterProcess = simpleCommand('qalter', qalterCmd, env=self.__env)
+    qalterProcess.start()
+    qalterProcess.wait()
+    qalterProcess.join()
+    exitCode = qalterProcess.exit_code()
+
+    return exitCode

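The qalter() wrapper above shells out to Torque's qalter binary, following the patch's own -W notes=... usage. A sketch of the command line it builds, where the Torque install path, field value, and job id are illustrative stand-ins:

# Sketch of the command line the qalter() wrapper constructs.
# torqueDir, the field value, and the job id are illustrative stand-ins.
import os

torqueDir = '/usr/local/torque'
qalterBin = os.path.join(torqueDir, 'bin', 'qalter')
fieldName, fieldValue, jobId = 'notes', 'ringmaster:host1.example.com:9001', '1234.torque-server'
qalterCmd = '%s -W %s="%s" %s' % (qalterBin, fieldName, fieldValue, jobId)
print(qalterCmd)
# /usr/local/torque/bin/qalter -W notes="ringmaster:host1.example.com:9001" 1234.torque-server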
+ 1 - 1
src/docs/src/documentation/content/xdocs/hod.xml

@@ -144,7 +144,7 @@
            <em>Twisted Python:</em> This can be used for improving the scalability of HOD. Twisted Python is available <a href="http://twistedmatrix.com/trac/">here</a>.
            </li>
            <li>
-            <em>Hadoop:</em> HOD can automatically distribute Hadoop to all nodes in the cluster. However, it can also use a pre-installed version of Hadoop, if it is available on all nodes in the cluster. HOD currently supports only Hadoop 0.16, which is under development.
+            <em>Hadoop:</em> HOD can automatically distribute Hadoop to all nodes in the cluster. However, it can also use a pre-installed version of Hadoop, if it is available on all nodes in the cluster. HOD currently supports Hadoop 0.15 and above.
            </li>
          </ul>
          <p>