Ver código fonte

AMBARI-15991. DataNode and RegionServer during upgrade are reported as "failed" incorrectly (Daniel Gergely via oleewere)

Daniel Gergely — 9 anos atrás · commit pai: 7232ea8dc2

+ 18 - 3
ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py

@@ -19,18 +19,18 @@ limitations under the License.
 
 """
 from resource_management import *
-from resource_management.core.resources.system import Execute
 from resource_management.core import shell
 from resource_management.libraries.functions import conf_select
 from resource_management.libraries.functions import stack_select
 from resource_management.libraries.functions import StackFeature
 from resource_management.libraries.functions.stack_features import check_stack_feature
 from resource_management.libraries.functions.decorator import retry
+from resource_management.libraries.functions import check_process_status
 
 def prestart(env, stack_component):
   import params
 
-  if params.version and check_stack_feature(StackFeature.ROLLING_UPGRADE, params.version):    
+  if params.version and check_stack_feature(StackFeature.ROLLING_UPGRADE, params.version):
     conf_select.select(params.stack_name, "hbase", params.version)
     stack_select.select(stack_component, params.version)
 
@@ -41,7 +41,22 @@ def post_regionserver(env):
   check_cmd = "echo 'status \"simple\"' | {0} shell".format(params.hbase_cmd)
 
   exec_cmd = "{0} {1}".format(params.kinit_cmd, check_cmd)
-  call_and_match(exec_cmd, params.hbase_user, params.hostname + ":", re.IGNORECASE)
+  _wait_for_region_server_to_start(exec_cmd, params.hbase_user, params.hostname + ":", re.IGNORECASE)
+
+@retry(times=3, sleep_time=300, err_class=Fail)
+def _wait_for_region_server_to_start(cmd, user, regex, regex_search_flags):
+  if not is_region_server_process_running():
+    Logger.info("RegionServer process is not running")
+    raise Fail("RegionServer process is not running")
+  call_and_match(cmd, user, regex, regex_search_flags)
+
+def is_region_server_process_running():
+  try:
+    pid_file = format("{pid_dir}/hbase-{hbase_user}-regionserver.pid")
+    check_process_status(pid_file)
+    return True
+  except ComponentIsNotRunning:
+    return False
 
 @retry(times=15, sleep_time=2, err_class=Fail)
 def call_and_match(cmd, user, regex, regex_search_flags):

+ 17 - 0
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py

@@ -24,6 +24,8 @@ from resource_management.core.resources.system import Execute
 from resource_management.core import shell
 from resource_management.libraries.functions import format
 from resource_management.libraries.functions.decorator import retry
+from resource_management.libraries.functions import check_process_status
+from resource_management.core import ComponentIsNotRunning
 from utils import get_dfsadmin_base_command
 
 
@@ -71,9 +73,24 @@ def post_upgrade_check(hdfs_binary):
     Execute(params.dn_kinit_cmd, user=params.hdfs_user)
 
   # verify that the datanode has started and rejoined the HDFS cluster
+  _wait_for_datanode_to_join(hdfs_binary)
+
+@retry(times=3, sleep_time=300, err_class=Fail)
+def _wait_for_datanode_to_join(hdfs_binary):
+  if not is_datanode_process_running():
+    Logger.info("DataNode process is not running")
+    raise Fail("DataNode process is not running")
   _check_datanode_startup(hdfs_binary)
 
 
+def is_datanode_process_running():
+  import params
+  try:
+    check_process_status(params.datanode_pid_file)
+    return True
+  except ComponentIsNotRunning:
+    return False
+
 @retry(times=24, sleep_time=5, err_class=Fail)
 def _check_datanode_shutdown(hdfs_binary):
   """

+ 1 - 0
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py

@@ -74,6 +74,7 @@ root_user = "root"
 hadoop_pid_dir_prefix = status_params.hadoop_pid_dir_prefix
 namenode_pid_file = status_params.namenode_pid_file
 zkfc_pid_file = status_params.zkfc_pid_file
+datanode_pid_file = status_params.datanode_pid_file
 
 # Some datanode settings
 dfs_dn_addr = default('/configurations/hdfs-site/dfs.datanode.address', None)

+ 17 - 15
ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py

@@ -18,9 +18,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 '''
 import json
+import upgrade
 from mock.mock import MagicMock, patch
 from stacks.utils.RMFTestCase import *
 
+@patch.object(upgrade, 'check_process_status', new = MagicMock())
 @patch("platform.linux_distribution", new = MagicMock(return_value="Linux"))
 @patch("os.path.exists", new = MagicMock(return_value=True))
 class TestHbaseRegionServer(RMFTestCase):
@@ -36,10 +38,10 @@ class TestHbaseRegionServer(RMFTestCase):
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES
     )
-    
+
     self.assert_configure_default()
     self.assertNoMoreResources()
-    
+
   def test_start_default(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/hbase_regionserver.py",
                    classname = "HbaseRegionServer",
@@ -48,14 +50,14 @@ class TestHbaseRegionServer(RMFTestCase):
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES
     )
-    
+
     self.assert_configure_default()
     self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf start regionserver',
       not_if = 'ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1',
       user = 'hbase'
     )
     self.assertNoMoreResources()
-    
+
   def test_stop_default(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/hbase_regionserver.py",
                    classname = "HbaseRegionServer",
@@ -64,19 +66,19 @@ class TestHbaseRegionServer(RMFTestCase):
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES
     )
-    
+
     self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver',
         only_if = 'ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1',
         on_timeout = '! ( ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1 ) || ambari-sudo.sh -H -E kill -9 `ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid`',
         timeout = 30,
         user = 'hbase',
     )
-    
+
     self.assertResourceCalled('File', '/var/run/hbase/hbase-hbase-regionserver.pid',
         action = ['delete'],
     )
     self.assertNoMoreResources()
-    
+
   def test_configure_secured(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/hbase_regionserver.py",
                    classname = "HbaseRegionServer",
@@ -85,10 +87,10 @@ class TestHbaseRegionServer(RMFTestCase):
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES
     )
-    
+
     self.assert_configure_secured()
     self.assertNoMoreResources()
-    
+
   def test_start_secured(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/hbase_regionserver.py",
                    classname = "HbaseRegionServer",
@@ -97,14 +99,14 @@ class TestHbaseRegionServer(RMFTestCase):
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES
     )
-    
+
     self.assert_configure_secured()
     self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf start regionserver',
       not_if = 'ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1',
       user = 'hbase',
     )
     self.assertNoMoreResources()
-    
+
   def test_stop_secured(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/hbase_regionserver.py",
                    classname = "HbaseRegionServer",
@@ -120,7 +122,7 @@ class TestHbaseRegionServer(RMFTestCase):
         timeout = 30,
         user = 'hbase',
     )
-    
+
     self.assertResourceCalled('File', '/var/run/hbase/hbase-hbase-regionserver.pid',
         action = ['delete'],
     )
@@ -329,7 +331,7 @@ class TestHbaseRegionServer(RMFTestCase):
                    config_file="hbase-rs-2.2.json",
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES)
-    
+
     self.assertResourceCalled('Directory', '/etc/hbase',
       mode = 0755)
 
@@ -675,8 +677,8 @@ class TestHbaseRegionServer(RMFTestCase):
                               ('ambari-python-wrap', '/usr/bin/hdp-select', 'set', 'hbase-regionserver', version), sudo=True,)
     self.assertNoMoreResources()
 
-
-  def test_post_rolling_restart(self):
+  @patch('time.sleep')
+  def test_post_rolling_restart(self, sleep_mock):
     config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"
     with open(config_file, "r") as f:
       json_content = json.load(f)

+ 7 - 7
ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py

@@ -25,7 +25,7 @@ from resource_management.core import shell
 from resource_management.core.exceptions import Fail
 import resource_management.libraries.functions.dfs_datanode_helper
 
-
+@patch.object(resource_management.libraries.functions, 'check_process_status', new = MagicMock())
 class TestDatanode(RMFTestCase):
   COMMON_SERVICES_PACKAGE_DIR = "HDFS/2.1.0.2.0/package"
   STACK_VERSION = "2.0.6"
@@ -515,10 +515,10 @@ class TestDatanode(RMFTestCase):
                        config_file = "default.json",
                        stack_version = self.STACK_VERSION,
                        target = RMFTestCase.TARGET_COMMON_SERVICES,
-                       call_mocks = [(0, shell_call_output)],
+                       call_mocks = [(0, shell_call_output)] * 3,
                        mocks_dict = mocks_dict
     )
-    
+
     self.assertTrue(mocks_dict['call'].called)
     self.assertEqual(mocks_dict['call'].call_count,1)
 
@@ -535,13 +535,13 @@ class TestDatanode(RMFTestCase):
                          config_file = "default.json",
                          stack_version = self.STACK_VERSION,
                          target = RMFTestCase.TARGET_COMMON_SERVICES,
-                         call_mocks = [(0, 'There are no DataNodes here!')],
+                         call_mocks = [(0, 'There are no DataNodes here!')] * 36,
                          mocks_dict = mocks_dict
       )
       self.fail('Missing DataNode should have caused a failure')
     except Fail,fail:
       self.assertTrue(mocks_dict['call'].called)
-      self.assertEqual(mocks_dict['call'].call_count,12)
+      self.assertEqual(mocks_dict['call'].call_count,36)
 
 
   @patch("socket.gethostbyname")
@@ -556,13 +556,13 @@ class TestDatanode(RMFTestCase):
                          config_file = "default.json",
                          stack_version = self.STACK_VERSION,
                          target = RMFTestCase.TARGET_COMMON_SERVICES,
-                         call_mocks = [(0, 'some')],
+                         call_mocks = [(1, 'some')] * 36,
                          mocks_dict = mocks_dict
       )
       self.fail('Invalid return code should cause a failure')
     except Fail,fail:
       self.assertTrue(mocks_dict['call'].called)
-      self.assertEqual(mocks_dict['call'].call_count,12)
+      self.assertEqual(mocks_dict['call'].call_count,36)
 
 
   @patch("resource_management.core.shell.call")