Browse Source

AMBARI-5108. HBaseRegionServer requires multiple retries to be stopped
while reassigning NameNode after enabling HA (aonishuk)

Andrew Onischuk 11 years ago
parent
commit
00fe3df41b

+ 10 - 1
ambari-agent/src/main/python/resource_management/core/providers/system.py

@@ -27,6 +27,7 @@ import os
 import pwd
 import pwd
 import time
 import time
 import shutil
 import shutil
+from subprocess import TimeoutExpired
 from resource_management.core import shell
 from resource_management.core import shell
 from resource_management.core.base import Fail
 from resource_management.core.base import Fail
 from resource_management.core.providers import Provider
 from resource_management.core.providers import Provider
@@ -231,7 +232,7 @@ class ExecuteProvider(Provider):
         shell.checked_call(self.resource.command, logoutput=self.resource.logoutput,
         shell.checked_call(self.resource.command, logoutput=self.resource.logoutput,
                             cwd=self.resource.cwd, env=self.resource.environment,
                             cwd=self.resource.cwd, env=self.resource.environment,
                             preexec_fn=_preexec_fn(self.resource), user=self.resource.user,
                             preexec_fn=_preexec_fn(self.resource), user=self.resource.user,
-                            wait_for_finish=self.resource.wait_for_finish)
+                            wait_for_finish=self.resource.wait_for_finish, timeout=self.resource.timeout)
         break
         break
       except Fail as ex:
       except Fail as ex:
         if i == self.resource.tries-1: # last try
         if i == self.resource.tries-1: # last try
@@ -239,6 +240,14 @@ class ExecuteProvider(Provider):
         else:
         else:
           Logger.info("Retrying after %d seconds. Reason: %s" % (self.resource.try_sleep, str(ex)))
           Logger.info("Retrying after %d seconds. Reason: %s" % (self.resource.try_sleep, str(ex)))
           time.sleep(self.resource.try_sleep)
           time.sleep(self.resource.try_sleep)
+      except TimeoutExpired:
+        err_msg = ("Execution of '%s' was killed due timeout after %d seconds") % (self.resource.command, self.resource.timeout)
+        
+        if self.resource.on_timeout:
+          Logger.info("Executing '%s'. Reason: %s" % (self.resource.on_timeout, err_msg))
+          shell.checked_call(self.resource.on_timeout)
+        else:
+          raise Fail(err_msg)
        
        
 
 
 class ExecuteScriptProvider(Provider):
 class ExecuteScriptProvider(Provider):

+ 6 - 0
ambari-agent/src/main/python/resource_management/core/resources/system.py

@@ -85,6 +85,12 @@ class Execute(Resource):
   actions = Resource.actions + ["run"]
   actions = Resource.actions + ["run"]
   logoutput = BooleanArgument(default=False)
   logoutput = BooleanArgument(default=False)
   """
   """
+  If on_timeout is not set, the command fails once timeout seconds have elapsed;
+  otherwise the on_timeout command is executed.
+  """
+  timeout = ResourceArgument() # seconds
+  on_timeout = ResourceArgument()
+  """
   Wait for command to finish or not. 
   Wait for command to finish or not. 
   
   
   NOTE:
   NOTE:

+ 14 - 7
ambari-agent/src/main/python/resource_management/core/shell.py

@@ -24,20 +24,21 @@ __all__ = ["checked_call", "call"]
 
 
 import subprocess
 import subprocess
 import pipes
 import pipes
+from subprocess import TimeoutExpired
 from exceptions import Fail
 from exceptions import Fail
 from resource_management.core.logger import Logger
 from resource_management.core.logger import Logger
 
 
 def checked_call(command, logoutput=False, 
 def checked_call(command, logoutput=False, 
-         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True):
-  return _call(command, logoutput, True, cwd, env, preexec_fn, user, wait_for_finish)
+         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True, timeout=None):
+  return _call(command, logoutput, True, cwd, env, preexec_fn, user, wait_for_finish, timeout)
 
 
 def call(command, logoutput=False, 
 def call(command, logoutput=False, 
-         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True):
-  return _call(command, logoutput, False, cwd, env, preexec_fn, user, wait_for_finish)
+         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True, timeout=None):
+  return _call(command, logoutput, False, cwd, env, preexec_fn, user, wait_for_finish, timeout)
   
   
 
 
 def _call(command, logoutput=False, throw_on_failure=True, 
 def _call(command, logoutput=False, throw_on_failure=True, 
-         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True):
+         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True, timeout=None):
   """
   """
   Execute shell command
   Execute shell command
   
   
@@ -63,8 +64,14 @@ def _call(command, logoutput=False, throw_on_failure=True,
 
 
   if not wait_for_finish:
   if not wait_for_finish:
     return None, None
     return None, None
+  
 
 
-  out = proc.communicate()[0].strip('\n')
+  try:
+    out = proc.communicate(timeout=timeout)[0].strip('\n')
+  except TimeoutExpired as ex:
+    proc.terminate()
+    raise ex
+    
   code = proc.returncode
   code = proc.returncode
   
   
   if logoutput and out:
   if logoutput and out:
@@ -74,4 +81,4 @@ def _call(command, logoutput=False, throw_on_failure=True,
     err_msg = ("Execution of '%s' returned %d. %s") % (command[-1], code, out)
     err_msg = ("Execution of '%s' returned %d. %s") % (command[-1], code, out)
     raise Fail(err_msg)
     raise Fail(err_msg)
   
   
-  return code, out
+  return code, out

+ 13 - 8
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py

@@ -29,18 +29,23 @@ def hbase_service(
     role = name
     role = name
     cmd = format("{daemon_script} --config {conf_dir}")
     cmd = format("{daemon_script} --config {conf_dir}")
     pid_file = format("{pid_dir}/hbase-{hbase_user}-{role}.pid")
     pid_file = format("{pid_dir}/hbase-{hbase_user}-{role}.pid")
-    
-    daemon_cmd = None
-    no_op_test = None
+    no_op_test = format("ls {pid_file} >/dev/null 2>&1 && ps `cat {pid_file}` >/dev/null 2>&1")
     
     
     if action == 'start':
     if action == 'start':
       daemon_cmd = format("{cmd} start {role}")
       daemon_cmd = format("{cmd} start {role}")
-      no_op_test = format("ls {pid_file} >/dev/null 2>&1 && ps `cat {pid_file}` >/dev/null 2>&1")
-    elif action == 'stop':
-      daemon_cmd = format("{cmd} stop {role} && rm -f {pid_file}")
-
-    if daemon_cmd is not None:
+      
       Execute ( daemon_cmd,
       Execute ( daemon_cmd,
         not_if = no_op_test,
         not_if = no_op_test,
         user = params.hbase_user
         user = params.hbase_user
       )
       )
+    elif action == 'stop':
+      daemon_cmd = format("{cmd} stop {role}")
+
+      Execute ( daemon_cmd,
+        user = params.hbase_user,
+        # BUGFIX: hbase regionserver sometimes hangs when the NameNode is in safemode
+        timeout = 30,
+        on_timeout = format("{no_op_test} && kill -9 `cat {pid_file}`")
+      )
+      
+      Execute (format("rm -f {pid_file}"))

+ 12 - 4
ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py

@@ -53,9 +53,13 @@ class TestHBaseMaster(RMFTestCase):
                    config_file="default.json"
                    config_file="default.json"
     )
     )
     
     
-    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master && rm -f /var/run/hbase/hbase-hbase-master.pid',
-      not_if = None,
+    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master',
       user = 'hbase',
       user = 'hbase',
+      on_timeout = 'ls /var/run/hbase/hbase-hbase-master.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-master.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-master.pid`', 
+      timeout = 30,
+    )
+    
+    self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-master.pid',
     )
     )
     self.assertNoMoreResources()
     self.assertNoMoreResources()
 
 
@@ -136,9 +140,13 @@ class TestHBaseMaster(RMFTestCase):
                    config_file="secured.json"
                    config_file="secured.json"
     )
     )
 
 
-    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master && rm -f /var/run/hbase/hbase-hbase-master.pid',
-      not_if = None,
+    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master',
       user = 'hbase',
       user = 'hbase',
+      on_timeout = 'ls /var/run/hbase/hbase-hbase-master.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-master.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-master.pid`', 
+      timeout = 30,
+    )
+    
+    self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-master.pid',
     )
     )
     self.assertNoMoreResources()
     self.assertNoMoreResources()
 
 

+ 12 - 4
ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py

@@ -53,9 +53,13 @@ class TestHbaseRegionServer(RMFTestCase):
                    config_file="default.json"
                    config_file="default.json"
     )
     )
     
     
-    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver && rm -f /var/run/hbase/hbase-hbase-regionserver.pid',
-      not_if = None,
+    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver',
       user = 'hbase',
       user = 'hbase',
+      on_timeout = 'ls /var/run/hbase/hbase-hbase-regionserver.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-regionserver.pid`', 
+      timeout = 30,
+    )
+    
+    self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-regionserver.pid',
     )
     )
     self.assertNoMoreResources()
     self.assertNoMoreResources()
     
     
@@ -90,9 +94,13 @@ class TestHbaseRegionServer(RMFTestCase):
                    config_file="secured.json"
                    config_file="secured.json"
     )
     )
 
 
-    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver && rm -f /var/run/hbase/hbase-hbase-regionserver.pid',
-      not_if = None,
+    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver',
       user = 'hbase',
       user = 'hbase',
+      on_timeout = 'ls /var/run/hbase/hbase-hbase-regionserver.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-regionserver.pid`', 
+      timeout = 30,
+    )
+    
+    self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-regionserver.pid',
     )
     )
     self.assertNoMoreResources()
     self.assertNoMoreResources()