
AMBARI-11624 - Datanode Shutdown Retries During Upgrade Are Too Long (jonathanhurley)

Jonathan Hurley · 10 years ago
parent commit b40d808d3a

+ 11 - 2
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py

@@ -62,18 +62,27 @@ def post_upgrade_check():
   _check_datanode_startup()


-@retry(times=12, sleep_time=10, err_class=Fail)
+@retry(times=24, sleep_time=5, err_class=Fail)
 def _check_datanode_shutdown():
   """
   Checks that a DataNode is down by running "hdfs dfsadmin -getDatanodeInfo"
   several times, pausing in between runs. Once the DataNode stops responding
   this method will return, otherwise it will raise a Fail(...) and retry
   automatically.
+  The stack defaults for HDFS client retries are also far too slow for this
+  command; they wait about 45 seconds between client retries. As a result,
+  a single execution of dfsadmin can take 45 seconds per retry, and the
+  DataNode may be marked as dead, causing problems with HBase.
+  https://issues.apache.org/jira/browse/HDFS-8510 tracks reducing the
+  default for ipc.client.connect.retry.interval. In the meantime, override
+  the retry settings here, but only for RU (Rolling Upgrade).
   :return:
   """
   import params

-  command = format('hdfs dfsadmin -getDatanodeInfo {dfs_dn_ipc_address}')
+  # override stock retry timeouts since after 30 seconds, the datanode is
+  # marked as dead and can affect HBase during RU
+  command = format('hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo {dfs_dn_ipc_address}')

   try:
     Execute(command, user=params.hdfs_user, tries=1)
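
Taken together, the two changes keep the overall shutdown wait budget roughly the same (12 × 10 s versus 24 × 5 s of sleep, about two minutes either way) but poll the DataNode twice as often, while the -D overrides cap each dfsadmin probe at roughly five 1-second connect retries instead of the ~45-second stack default. As the docstring notes, the decorated check returns as soon as getDatanodeInfo stops answering and raises Fail otherwise, which triggers the next retry. For reference, below is a minimal, self-contained sketch of a retry decorator with the semantics used here; Ambari's actual implementation lives in its resource_management library and may differ in detail.

# Illustrative sketch only, not Ambari's resource_management implementation:
# re-run the wrapped function up to `times` attempts, sleeping `sleep_time`
# seconds after each failure of type `err_class`, and re-raise the last error.
import time
from functools import wraps

def retry(times=3, sleep_time=1, err_class=Exception):
  def decorator(function):
    @wraps(function)
    def wrapper(*args, **kwargs):
      attempts_left = times
      while True:
        try:
          return function(*args, **kwargs)
        except err_class:
          attempts_left -= 1
          if attempts_left <= 0:
            raise  # out of attempts; surface the final failure to the caller
          time.sleep(sleep_time)
    return wrapper
  return decorator

With times=24 and sleep_time=5, a DataNode that never stops responding is probed 24 times with about two minutes of sleep in total, the same budget as 12 × 10 s, but a clean shutdown is now detected within about 5 seconds instead of 10.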

+ 27 - 0
ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py

@@ -544,6 +544,33 @@ class TestDatanode(RMFTestCase):
       self.assertTrue(mocks_dict['call'].called)
       self.assertEqual(mocks_dict['call'].call_count,12)

+
+  @patch('time.sleep')
+  def test_stop_during_upgrade(self, time_mock):
+    config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"
+    with open(config_file, "r") as f:
+      json_content = json.load(f)
+
+    version = '2.2.1.0-3242'
+    json_content['commandParams']['version'] = version
+
+    try:
+      self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/datanode.py",
+        classname = "DataNode",
+        command = "stop",
+        config_dict = json_content,
+        hdp_stack_version = self.STACK_VERSION,
+        target = RMFTestCase.TARGET_COMMON_SERVICES,
+        command_args=[True])
+
+      raise Fail("Expected a fail since datanode didn't report a shutdown")
+    except:
+      pass
+
+    self.assertResourceCalled('Execute', 'hdfs dfsadmin -shutdownDatanode 0.0.0.0:8010 upgrade', user="hdfs", tries=1)
+    self.assertResourceCalled('Execute', 'hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo 0.0.0.0:8010', user="hdfs", tries=1)
+
+
   @patch("resource_management.libraries.functions.security_commons.build_expectations")
   @patch("resource_management.libraries.functions.security_commons.build_expectations")
   @patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem")
   @patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem")
   @patch("resource_management.libraries.functions.security_commons.validate_security_config_properties")
   @patch("resource_management.libraries.functions.security_commons.validate_security_config_properties")