
AMBARI-11624 - Datanode Shutdown Retries During Upgrade Are Too Long (jonathanhurley)

Jonathan Hurley
parent commit b40d808d3a

ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py (+11 -2)

@@ -62,18 +62,27 @@ def post_upgrade_check():
   _check_datanode_startup()
 
 
-@retry(times=12, sleep_time=10, err_class=Fail)
+@retry(times=24, sleep_time=5, err_class=Fail)
 def _check_datanode_shutdown():
   """
  Checks that a DataNode is down by running "hdfs dfsadmin -getDatanodeInfo"
   several times, pausing in between runs. Once the DataNode stops responding
   this method will return, otherwise it will raise a Fail(...) and retry
   automatically.
+  The stack's default HDFS client retry settings are also far too slow for
+  this command; they wait about 45 seconds between client retries. As a
+  result, a single dfsadmin execution can take 45 seconds to retry, and in
+  the meantime the DataNode may be marked as dead, causing problems with
+  HBase. https://issues.apache.org/jira/browse/HDFS-8510 tracks reducing
+  the default for ipc.client.connect.retry.interval. Until then, override
+  the values here, but only for RU.
   :return:
   """
   import params
 
-  command = format('hdfs dfsadmin -getDatanodeInfo {dfs_dn_ipc_address}')
+  # Override the stock retry timeouts: after 30 seconds the DataNode is
+  # marked as dead, which can affect HBase during RU.
+  command = format('hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo {dfs_dn_ipc_address}')
 
   try:
     Execute(command, user=params.hdfs_user, tries=1)
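
The decorator change above halves the poll interval while doubling the attempt
count, so the overall shutdown window stays at roughly two minutes (12 x 10s
vs. 24 x 5s of sleeping) while a stopped DataNode is detected up to five
seconds sooner. A minimal sketch of the retry semantics, assuming the decorator
simply re-invokes the wrapped function and re-raises err_class once all
attempts are exhausted (the real decorator ships with resource_management):

import time
from functools import wraps

def retry(times, sleep_time, err_class):
  """Re-run the wrapped function up to `times` times, sleeping `sleep_time`
  seconds between failed attempts; the final `err_class` is re-raised."""
  def decorator(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
      for attempt in range(times):
        try:
          return func(*args, **kwargs)
        except err_class:
          if attempt == times - 1:
            raise  # attempts exhausted; propagate the failure
          time.sleep(sleep_time)
    return wrapper
  return decorator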

ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py (+27 -0)

@@ -544,6 +544,33 @@ class TestDatanode(RMFTestCase):
       self.assertTrue(mocks_dict['call'].called)
       self.assertEqual(mocks_dict['call'].call_count,12)
 
+
+  @patch('time.sleep')
+  def test_stop_during_upgrade(self, time_mock):
+    config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"
+    with open(config_file, "r") as f:
+      json_content = json.load(f)
+
+    version = '2.2.1.0-3242'
+    json_content['commandParams']['version'] = version
+
+    try:
+      self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/datanode.py",
+        classname = "DataNode",
+        command = "stop",
+        config_dict = json_content,
+        hdp_stack_version = self.STACK_VERSION,
+        target = RMFTestCase.TARGET_COMMON_SERVICES,
+        command_args=[True])
+
+      raise Fail("Expected a fail since datanode didn't report a shutdown")
+    except:
+      pass
+
+    self.assertResourceCalled('Execute', 'hdfs dfsadmin -shutdownDatanode 0.0.0.0:8010 upgrade', user="hdfs", tries=1)
+    self.assertResourceCalled('Execute', 'hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo 0.0.0.0:8010', user="hdfs", tries=1)
+
+
   @patch("resource_management.libraries.functions.security_commons.build_expectations")
   @patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem")
   @patch("resource_management.libraries.functions.security_commons.validate_security_config_properties")