
AMBARI-11624 - Datanode Shutdown Retries During Upgrade Are Too Long (jonathanhurley)

Jonathan Hurley · 10 years ago
parent commit b40d808d3a

+ 11 - 2
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py

@@ -62,18 +62,27 @@ def post_upgrade_check():
   _check_datanode_startup()


-@retry(times=12, sleep_time=10, err_class=Fail)
+@retry(times=24, sleep_time=5, err_class=Fail)
 def _check_datanode_shutdown():
   """
   Checks that a DataNode is down by running "hdfs dfsadmin -getDatanodeInfo"
   several times, pausing in between runs. Once the DataNode stops responding
   this method will return, otherwise it will raise a Fail(...) and retry
   automatically.
+  The stack defaults for HDFS client retries are also far too slow for this
+  command; they wait about 45 seconds between client retries. As a result,
+  a single execution of dfsadmin can take 45 seconds per retry, and the
+  DataNode may be marked as dead, causing problems with HBase.
+  https://issues.apache.org/jira/browse/HDFS-8510 tracks reducing the
+  default for ipc.client.connect.retry.interval. In the meantime, override
+  the retry settings here, but only for RU (Rolling Upgrade).
   :return:
   """
   import params

-  command = format('hdfs dfsadmin -getDatanodeInfo {dfs_dn_ipc_address}')
+  # override stock retry timeouts since after 30 seconds, the datanode is
+  # marked as dead and can affect HBase during RU
+  command = format('hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo {dfs_dn_ipc_address}')

   try:
     Execute(command, user=params.hdfs_user, tries=1)
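
Taken together, the two changes keep the overall shutdown wait budget roughly the same (12 × 10 s versus 24 × 5 s of sleep, about two minutes either way) but poll the DataNode twice as often, while the -D overrides cap each dfsadmin probe at roughly five 1-second connect retries instead of the ~45-second stack default. As the docstring notes, the decorated check returns as soon as getDatanodeInfo stops answering and raises Fail otherwise, which triggers the next retry. For reference, below is a minimal, self-contained sketch of a retry decorator with the semantics used here; Ambari's actual implementation lives in its resource_management library and may differ in detail.

# Illustrative sketch only, not Ambari's resource_management implementation:
# re-run the wrapped function up to `times` attempts, sleeping `sleep_time`
# seconds after each failure of type `err_class`, and re-raise the last error.
import time
from functools import wraps

def retry(times=3, sleep_time=1, err_class=Exception):
  def decorator(function):
    @wraps(function)
    def wrapper(*args, **kwargs):
      attempts_left = times
      while True:
        try:
          return function(*args, **kwargs)
        except err_class:
          attempts_left -= 1
          if attempts_left <= 0:
            raise  # out of attempts; surface the final failure to the caller
          time.sleep(sleep_time)
    return wrapper
  return decorator

With times=24 and sleep_time=5, a DataNode that never stops responding is probed 24 times with about two minutes of sleep in total, the same budget as 12 × 10 s, but a clean shutdown is now detected within about 5 seconds instead of 10.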

+ 27 - 0
ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py

@@ -544,6 +544,33 @@ class TestDatanode(RMFTestCase):
       self.assertTrue(mocks_dict['call'].called)
       self.assertEqual(mocks_dict['call'].call_count,12)

+
+  @patch('time.sleep')
+  def test_stop_during_upgrade(self, time_mock):
+    config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"
+    with open(config_file, "r") as f:
+      json_content = json.load(f)
+
+    version = '2.2.1.0-3242'
+    json_content['commandParams']['version'] = version
+
+    try:
+      self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/datanode.py",
+        classname = "DataNode",
+        command = "stop",
+        config_dict = json_content,
+        hdp_stack_version = self.STACK_VERSION,
+        target = RMFTestCase.TARGET_COMMON_SERVICES,
+        command_args=[True])
+
+      raise Fail("Expected a fail since datanode didn't report a shutdown")
+    except:
+      pass
+
+    self.assertResourceCalled('Execute', 'hdfs dfsadmin -shutdownDatanode 0.0.0.0:8010 upgrade', user="hdfs", tries=1)
+    self.assertResourceCalled('Execute', 'hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo 0.0.0.0:8010', user="hdfs", tries=1)
+
+
   @patch("resource_management.libraries.functions.security_commons.build_expectations")
   @patch("resource_management.libraries.functions.security_commons.build_expectations")
   @patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem")
   @patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem")
   @patch("resource_management.libraries.functions.security_commons.validate_security_config_properties")
   @patch("resource_management.libraries.functions.security_commons.validate_security_config_properties")