浏览代码

AMBARI-12013. Datanode failed to restart during RU because the shutdownDatanode -upgrade command can fail sometimes (alejandro)

Alejandro Fernandez 10 年之前
父节点
当前提交
5194615877

+ 3 - 1
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode.py

@@ -52,7 +52,9 @@ class DataNode(Script):
     # pre-upgrade steps shutdown the datanode, so there's no need to call
     # pre-upgrade steps shutdown the datanode, so there's no need to call
     # action=stop
     # action=stop
     if rolling_restart:
     if rolling_restart:
-      datanode_upgrade.pre_upgrade_shutdown()
+      force_stop = datanode_upgrade.pre_upgrade_shutdown()
+      if force_stop:
+        datanode(action="stop")
     else:
     else:
       datanode(action="stop")
       datanode(action="stop")
 
 

+ 12 - 4
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py

@@ -16,6 +16,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 limitations under the License.
 
 
 """
 """
+import re
 
 
 from resource_management.core.logger import Logger
 from resource_management.core.logger import Logger
 from resource_management.core.exceptions import Fail
 from resource_management.core.exceptions import Fail
@@ -31,7 +32,7 @@ def pre_upgrade_shutdown():
   DataNode in preparation for an upgrade. This will then periodically check
   DataNode in preparation for an upgrade. This will then periodically check
   "getDatanodeInfo" to ensure the DataNode has shutdown correctly.
   "getDatanodeInfo" to ensure the DataNode has shutdown correctly.
   This function will obtain the Kerberos ticket if security is enabled.
   This function will obtain the Kerberos ticket if security is enabled.
-  :return:
+  :return: Return True if ran ok (even with errors), and False if need to stop the datanode forcefully.
   """
   """
   import params
   import params
 
 
@@ -40,10 +41,17 @@ def pre_upgrade_shutdown():
     Execute(params.dn_kinit_cmd, user = params.hdfs_user)
     Execute(params.dn_kinit_cmd, user = params.hdfs_user)
 
 
   command = format('hdfs dfsadmin -shutdownDatanode {dfs_dn_ipc_address} upgrade')
   command = format('hdfs dfsadmin -shutdownDatanode {dfs_dn_ipc_address} upgrade')
-  Execute(command, user=params.hdfs_user, tries=1 )
 
 
-  # verify that the datanode is down
-  _check_datanode_shutdown()
+  code, output = shell.call(command, user=params.hdfs_user)
+  if code == 0:
+    # verify that the datanode is down
+    _check_datanode_shutdown()
+  else:
+    # Due to bug HDFS-7533, DataNode may not always shutdown during rolling upgrade, and it is necessary to kill it.
+    if output is not None and re.search("Shutdown already in progress", output):
+      Logger.error("Due to a known issue in DataNode, the command {0} did not work and will shutdown the datanode forcefully.".format(command))
+      return False
+  return True
 
 
 
 
 def post_upgrade_check():
 def post_upgrade_check():

+ 10 - 6
ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py

@@ -545,9 +545,12 @@ class TestDatanode(RMFTestCase):
       self.assertEqual(mocks_dict['call'].call_count,12)
       self.assertEqual(mocks_dict['call'].call_count,12)
 
 
 
 
+  @patch("resource_management.core.shell.call")
   @patch('time.sleep')
   @patch('time.sleep')
-  def test_stop_during_upgrade(self, time_mock):
+  def test_stop_during_upgrade(self, time_mock, call_mock):
     config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"
     config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"
+    call_mock_side_effects = [(0, ""), ]
+    call_mock.side_effect = call_mock_side_effects
     with open(config_file, "r") as f:
     with open(config_file, "r") as f:
       json_content = json.load(f)
       json_content = json.load(f)
 
 
@@ -561,15 +564,16 @@ class TestDatanode(RMFTestCase):
         config_dict = json_content,
         config_dict = json_content,
         hdp_stack_version = self.STACK_VERSION,
         hdp_stack_version = self.STACK_VERSION,
         target = RMFTestCase.TARGET_COMMON_SERVICES,
         target = RMFTestCase.TARGET_COMMON_SERVICES,
+        call_mocks = call_mock_side_effects,
         command_args=[True])
         command_args=[True])
 
 
       raise Fail("Expected a fail since datanode didn't report a shutdown")
       raise Fail("Expected a fail since datanode didn't report a shutdown")
-    except:
-      pass
-
-    self.assertResourceCalled('Execute', 'hdfs dfsadmin -shutdownDatanode 0.0.0.0:8010 upgrade', user="hdfs", tries=1)
-    self.assertResourceCalled('Execute', 'hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo 0.0.0.0:8010', user="hdfs", tries=1)
+    except Exception, err:
+      expected_message = "DataNode has not shutdown."
+      if str(err.message) != expected_message:
+        self.fail("Expected this exception to be thrown. " + expected_message + ". Got this instead, " + str(err.message))
 
 
+    self.assertResourceCalled("Execute", "hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo 0.0.0.0:8010", tries=1, user="hdfs")
 
 
   @patch("resource_management.libraries.functions.security_commons.build_expectations")
   @patch("resource_management.libraries.functions.security_commons.build_expectations")
   @patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem")
   @patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem")