Browse source

AMBARI-12013. Datanode failed to restart during RU because the shutdownDatanode -upgrade command can fail sometimes (alejandro)

Alejandro Fernandez 10 years ago
parent
commit
5194615877

+ 3 - 1
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode.py

@@ -52,7 +52,9 @@ class DataNode(Script):
     # pre-upgrade steps shutdown the datanode, so there's no need to call
     # action=stop
     if rolling_restart:
-      datanode_upgrade.pre_upgrade_shutdown()
+      force_stop = datanode_upgrade.pre_upgrade_shutdown()
+      if force_stop:
+        datanode(action="stop")
     else:
       datanode(action="stop")
 

+ 12 - 4
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py

@@ -16,6 +16,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 
 """
+import re
 
 from resource_management.core.logger import Logger
 from resource_management.core.exceptions import Fail
@@ -31,7 +32,7 @@ def pre_upgrade_shutdown():
   DataNode in preparation for an upgrade. This will then periodically check
   "getDatanodeInfo" to ensure the DataNode has shutdown correctly.
   This function will obtain the Kerberos ticket if security is enabled.
-  :return:
+  :return: True if the graceful shutdown command ran (even with errors); False if the DataNode must be stopped forcefully.
   """
   import params
 
@@ -40,10 +41,17 @@ def pre_upgrade_shutdown():
     Execute(params.dn_kinit_cmd, user = params.hdfs_user)
 
   command = format('hdfs dfsadmin -shutdownDatanode {dfs_dn_ipc_address} upgrade')
-  Execute(command, user=params.hdfs_user, tries=1 )
 
-  # verify that the datanode is down
-  _check_datanode_shutdown()
+  code, output = shell.call(command, user=params.hdfs_user)
+  if code == 0:
+    # verify that the datanode is down
+    _check_datanode_shutdown()
+  else:
+    # Due to bug HDFS-7533, DataNode may not always shutdown during rolling upgrade, and it is necessary to kill it.
+    if output is not None and re.search("Shutdown already in progress", output):
+      Logger.error("Due to a known issue in DataNode, the command {0} did not work and will shutdown the datanode forcefully.")
+      return False
+  return True
 
 
 def post_upgrade_check():

+ 10 - 6
ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py

@@ -545,9 +545,12 @@ class TestDatanode(RMFTestCase):
       self.assertEqual(mocks_dict['call'].call_count,12)
 
 
+  @patch("resource_management.core.shell.call")
   @patch('time.sleep')
-  def test_stop_during_upgrade(self, time_mock):
+  def test_stop_during_upgrade(self, time_mock, call_mock):
     config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"
+    call_mock_side_effects = [(0, ""), ]
+    call_mock.side_effects = call_mock_side_effects
     with open(config_file, "r") as f:
       json_content = json.load(f)
 
@@ -561,15 +564,16 @@ class TestDatanode(RMFTestCase):
         config_dict = json_content,
         hdp_stack_version = self.STACK_VERSION,
         target = RMFTestCase.TARGET_COMMON_SERVICES,
+        call_mocks = call_mock_side_effects,
         command_args=[True])
 
       raise Fail("Expected a fail since datanode didn't report a shutdown")
-    except:
-      pass
-
-    self.assertResourceCalled('Execute', 'hdfs dfsadmin -shutdownDatanode 0.0.0.0:8010 upgrade', user="hdfs", tries=1)
-    self.assertResourceCalled('Execute', 'hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo 0.0.0.0:8010', user="hdfs", tries=1)
+    except Exception, err:
+      expected_message = "DataNode has not shutdown."
+      if str(err.message) != expected_message:
+        self.fail("Expected this exception to be thrown. " + expected_message + ". Got this instead, " + str(err.message))
 
+    self.assertResourceCalled("Execute", "hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo 0.0.0.0:8010", tries=1, user="hdfs")
 
   @patch("resource_management.libraries.functions.security_commons.build_expectations")
   @patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem")