Jelajahi Sumber

AMBARI-11605. Restarting HistoryServer fails during RU because NameNode is in safemode (alejandro)

Alejandro Fernandez 10 tahun lalu
induk
melakukan
b6c115ba24

+ 54 - 24
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py

@@ -18,12 +18,24 @@ limitations under the License.
 """
 import os.path
 
-from resource_management import *
-from resource_management.core.logger import Logger
-from resource_management.core.exceptions import ComponentIsNotRunning
+
+from resource_management.core import shell
+from resource_management.core.source import Template
+from resource_management.core.resources.system import File, Execute, Directory
+from resource_management.core.resources.service import Service
+from resource_management.libraries.functions.format import format
 from resource_management.libraries.functions.check_process_status import check_process_status
+from resource_management.libraries.resources.execute_hadoop import ExecuteHadoop
+from ambari_commons import OSCheck, OSConst
 from ambari_commons.os_family_impl import OsFamilyImpl, OsFamilyFuncImpl
-from ambari_commons import OSConst
+
+if OSCheck.is_windows_family():
+  from resource_management.libraries.functions.windows_service_utils import check_windows_service_status
+
+from resource_management.core.shell import as_user
+from resource_management.core.exceptions import Fail
+from resource_management.core.logger import Logger
+
 from utils import service, safe_zkfc_op
 from setup_ranger_hdfs import setup_ranger_hdfs
 
@@ -77,24 +89,40 @@ def namenode(action=None, do_format=True, rolling_restart=False, env=None):
       Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
               user = params.hdfs_user)
 
+    is_namenode_safe_mode_off = format("hadoop dfsadmin -fs {namenode_address} -safemode get | grep 'Safe mode is OFF'")
+    is_active_namenode_cmd = as_user(format("hdfs --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})
+
+    # During normal operations, if HA is enabled and this NameNode is the standby, stay in the current state; otherwise, leave safemode.
+    # During Rolling Upgrade, both namenodes must leave safemode.
+
+    # ___Scenario_________|_Expected safemode state___|_Wait for safemode OFF____|
+    # 1 (HA and active)   | ON -> OFF                 | Yes                      |
+    # 2 (HA and standby)  | no change (yes during RU) | no check (yes during RU) |
+    # 3 (no-HA)           | ON -> OFF                 | Yes                      |
+    leave_safe_mode = False
+    msg = ""
     if params.dfs_ha_enabled:
-      dfs_check_nn_status_cmd = as_user(format("hdfs --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})
+      code, out = shell.call(is_active_namenode_cmd, logoutput=True) # If active NN, code will be 0
+      if code == 0: # active
+        leave_safe_mode = True
+        msg = "Must leave safemode since High Availability is enabled and this is the Active NameNode."
+      elif rolling_restart:
+        leave_safe_mode = True
+        msg = "Must leave safemode since High Availability is enabled during a Rolling Upgrade"
     else:
-      dfs_check_nn_status_cmd = None
+      msg = "Must leave safemode since High Availability is not enabled."
+      leave_safe_mode = True
 
-    namenode_safe_mode_off = format("hadoop dfsadmin -fs {namenode_address} -safemode get | grep 'Safe mode is OFF'")
-
-    # If HA is enabled and it is in standby, then stay in safemode, otherwise, leave safemode.
-    leave_safe_mode = True
-    if dfs_check_nn_status_cmd is not None:
-      code, out = shell.call(dfs_check_nn_status_cmd) # If active NN, code will be 0
-      if code != 0:
-        leave_safe_mode = False
+    if not msg:
+      msg = "Will remain in the current safemode state."
+    Logger.info(msg)
 
     if leave_safe_mode:
       # First check if Namenode is not in 'safemode OFF' (equivalent to safemode ON), if so, then leave it
-      code, out = shell.call(namenode_safe_mode_off, user=params.hdfs_user)
+      Logger.info("Checking the NameNode safemode status since may need to transition from ON to OFF.")
+      code, out = shell.call(is_namenode_safe_mode_off, user=params.hdfs_user)
       if code != 0:
+        Logger.info("Will need to leave safemode, state should be OFF.")
         leave_safe_mode_cmd = format("hdfs --config {hadoop_conf_dir} dfsadmin -fs {namenode_address} -safemode leave")
         Execute(leave_safe_mode_cmd,
                 tries=10,
@@ -103,15 +131,17 @@ def namenode(action=None, do_format=True, rolling_restart=False, env=None):
                 path=[params.hadoop_bin_dir],
         )
 
-    # Verify if Namenode should be in safemode OFF
-    Execute(namenode_safe_mode_off,
-            tries=40,
-            try_sleep=10,
-            path=[params.hadoop_bin_dir],
-            user=params.hdfs_user,
-            only_if=dfs_check_nn_status_cmd #skip when HA not active
-    )
-    create_hdfs_directories(dfs_check_nn_status_cmd)
+        Logger.info("Checking if safemode state is now OFF.")
+        # Verify if Namenode should be in safemode OFF
+        Execute(is_namenode_safe_mode_off,
+                tries=40,
+                try_sleep=10,
+                path=[params.hadoop_bin_dir],
+                user=params.hdfs_user
+        )
+
+      # Always run this when HA is disabled, or on the active NameNode when HA is enabled.
+      create_hdfs_directories(is_active_namenode_cmd)
   elif action == "stop":
     import params
     service(

+ 15 - 8
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py

@@ -19,27 +19,34 @@ limitations under the License.
 
 import sys
 import os
+import json
+import tempfile
+from datetime import datetime
 import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
-import  tempfile
-from resource_management import *
+
+from resource_management import Script
+from resource_management.core.resources.system import Execute
 from resource_management.libraries.functions import conf_select
 from resource_management.libraries.functions import hdp_select
+from resource_management.libraries.functions.version import compare_versions, format_hdp_stack_version
+from resource_management.libraries.functions.format import format
 from resource_management.libraries.functions.security_commons import build_expectations, \
   cached_kinit_executor, get_params_from_filesystem, validate_security_config_properties, \
   FILE_TYPE_XML
-from resource_management.libraries.functions.version import compare_versions, \
-  format_hdp_stack_version
-from resource_management.libraries.functions.format import format
+
 from resource_management.core.exceptions import Fail
-from datetime import datetime
+from resource_management.core.shell import as_user
+from resource_management.core.logger import Logger
+
+from ambari_commons.os_family_impl import OsFamilyImpl
+from ambari_commons import OSConst
 
 import namenode_upgrade
 from hdfs_namenode import namenode
 from hdfs import hdfs
 import hdfs_rebalance
 from utils import failover_namenode
-from ambari_commons.os_family_impl import OsFamilyImpl
-from ambari_commons import OSConst
+
 
 # hashlib is supplied as of Python 2.5 as the replacement interface for md5
 # and other secure hashes.  In 2.6, md5 is deprecated.  Import hashlib if

+ 13 - 8
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/utils.py

@@ -21,10 +21,14 @@ import re
 import urllib2
 import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
 
-from resource_management import *
+from resource_management.core.resources.system import Directory, File, Execute
 from resource_management.libraries.functions.format import format
-from resource_management.core.shell import call, checked_call
+from resource_management.libraries.functions import check_process_status
+from resource_management.libraries.functions.version import compare_versions
+from resource_management.core import shell
+from resource_management.core.shell import as_user, as_sudo
 from resource_management.core.exceptions import ComponentIsNotRunning
+from resource_management.core.logger import Logger
 
 from zkfc_slave import ZkfcSlave
 
@@ -34,6 +38,7 @@ def safe_zkfc_op(action, env):
   :param action: start or stop
   :param env: environment
   """
+  Logger.info("Performing action {0} on zkfc.".format(action))
   zkfc = None
   if action == "start":
     try:
@@ -60,7 +65,7 @@ def failover_namenode():
   """
   import params
   check_service_cmd = format("hdfs haadmin -getServiceState {namenode_id}")
-  code, out = call(check_service_cmd, logoutput=True, user=params.hdfs_user)
+  code, out = shell.call(check_service_cmd, logoutput=True, user=params.hdfs_user)
 
   state = "unknown"
   if code == 0 and out:
@@ -78,22 +83,22 @@ def failover_namenode():
     check_standby_cmd = format("hdfs haadmin -getServiceState {namenode_id} | grep standby")
 
     # process may already be down.  try one time, then proceed
-    code, out = call(check_standby_cmd, user=params.hdfs_user, logoutput=True)
+    code, out = shell.call(check_standby_cmd, user=params.hdfs_user, logoutput=True)
     Logger.info(format("Rolling Upgrade - check for standby returned {code}"))
 
     if code == 255 and out:
-      Logger.info("Rolling Upgrade - namenode is already down")
+      Logger.info("Rolling Upgrade - namenode is already down.")
     else:
       if was_zkfc_killed:
         # Only mandate that this be the standby namenode if ZKFC was indeed killed to initiate a failover.
+        Logger.info("Waiting for this NameNode to become the standby one.")
         Execute(check_standby_cmd,
                 user=params.hdfs_user,
                 tries=50,
                 try_sleep=6,
                 logoutput=True)
-
   else:
-    Logger.info("Rolling Upgrade - Host %s is the standby namenode." % str(params.hostname))
+    Logger.info("Rolling Upgrade - Host %s is already the standby namenode." % str(params.hostname))
 
 
 def kill_zkfc(zkfc_user):
@@ -109,7 +114,7 @@ def kill_zkfc(zkfc_user):
     zkfc_pid_file = get_service_pid_file("zkfc", zkfc_user)
     if zkfc_pid_file:
       check_process = format("ls {zkfc_pid_file} > /dev/null 2>&1 && ps -p `cat {zkfc_pid_file}` > /dev/null 2>&1")
-      code, out = call(check_process)
+      code, out = shell.call(check_process)
       if code == 0:
         Logger.debug("ZKFC is running and will be killed to initiate namenode failover.")
         kill_command = format("{check_process} && kill -9 `cat {zkfc_pid_file}` > /dev/null 2>&1")

+ 13 - 43
ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py

@@ -96,13 +96,12 @@ class TestNamenode(RMFTestCase):
     self.assertResourceCalled('Execute', "hadoop dfsadmin -fs hdfs://c6405.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
                               path = ['/usr/bin'],
                               tries = 40,
-                              only_if = None,
                               user = 'hdfs',
                               try_sleep = 10,
                               )
     self.assertResourceCalled('HdfsResource', '/tmp',
         security_enabled = False,
-        only_if = None,
+        only_if="ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState None | grep active'",
         keytab = UnknownConfigurationMock(),
         hadoop_bin_dir = '/usr/bin',
         default_fs = 'wasb://abc@c6401.ambari.apache.org',
@@ -118,7 +117,7 @@ class TestNamenode(RMFTestCase):
     )
     self.assertResourceCalled('HdfsResource', '/user/ambari-qa',
         security_enabled = False,
-        only_if = None,
+        only_if="ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState None | grep active'",
         keytab = UnknownConfigurationMock(),
         hadoop_bin_dir = '/usr/bin',
         default_fs = 'wasb://abc@c6401.ambari.apache.org',
@@ -134,7 +133,7 @@ class TestNamenode(RMFTestCase):
     )
     self.assertResourceCalled('HdfsResource', None,
         security_enabled = False,
-        only_if = None,
+        only_if="ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState None | grep active'",
         keytab = UnknownConfigurationMock(),
         hadoop_bin_dir = '/usr/bin',
         default_fs = 'wasb://abc@c6401.ambari.apache.org',
@@ -213,13 +212,12 @@ class TestNamenode(RMFTestCase):
     self.assertResourceCalled('Execute', "hadoop dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
         path = ['/usr/bin'],
         tries = 40,
-        only_if = None,
         user = 'hdfs',
         try_sleep = 10,
     )
     self.assertResourceCalled('HdfsResource', '/tmp',
         security_enabled = False,
-        only_if = None,
+        only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState None | grep active'",
         keytab = UnknownConfigurationMock(),
         hadoop_bin_dir = '/usr/bin',
         default_fs = 'hdfs://c6401.ambari.apache.org:8020',
@@ -235,7 +233,7 @@ class TestNamenode(RMFTestCase):
     )
     self.assertResourceCalled('HdfsResource', '/user/ambari-qa',
         security_enabled = False,
-        only_if = None,
+        only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState None | grep active'",
         keytab = UnknownConfigurationMock(),
         hadoop_bin_dir = '/usr/bin',
         default_fs = 'hdfs://c6401.ambari.apache.org:8020',
@@ -251,7 +249,7 @@ class TestNamenode(RMFTestCase):
     )
     self.assertResourceCalled('HdfsResource', None,
         security_enabled = False,
-        only_if = None,
+        only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState None | grep active'",
         keytab = UnknownConfigurationMock(),
         hadoop_bin_dir = '/usr/bin',
         default_fs = 'hdfs://c6401.ambari.apache.org:8020',
@@ -353,7 +351,6 @@ class TestNamenode(RMFTestCase):
     self.assertResourceCalled('Execute', "hadoop dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'",
         path = ['/usr/bin'],
         tries = 40,
-        only_if = None,
         user = 'hdfs',
         try_sleep = 10,
     )
@@ -368,7 +365,7 @@ class TestNamenode(RMFTestCase):
         type = 'directory',
         action = ['create_on_execute'], hdfs_site=self.getConfig()['configurations']['hdfs-site'], principal_name='hdfs', default_fs='hdfs://c6401.ambari.apache.org:8020',
         mode = 0777,
-        only_if = None,
+        only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState None | grep active'"
     )
     self.assertResourceCalled('HdfsResource', '/user/ambari-qa',
         security_enabled = True,
@@ -381,11 +378,11 @@ class TestNamenode(RMFTestCase):
         type = 'directory',
         action = ['create_on_execute'], hdfs_site=self.getConfig()['configurations']['hdfs-site'], principal_name='hdfs', default_fs='hdfs://c6401.ambari.apache.org:8020',
         mode = 0770,
-        only_if = None,
+        only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState None | grep active'"
     )
     self.assertResourceCalled('HdfsResource', None,
         security_enabled = True,
-        only_if = None,
+        only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState None | grep active'",
         keytab = '/etc/security/keytabs/hdfs.headless.keytab',
         hadoop_bin_dir = '/usr/bin',
         kinit_path_local = '/usr/bin/kinit',
@@ -451,13 +448,6 @@ class TestNamenode(RMFTestCase):
         environment = {'HADOOP_LIBEXEC_DIR': '/usr/lib/hadoop/libexec'},
         not_if = 'ls /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid >/dev/null 2>&1 && ps -p `cat /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid` >/dev/null 2>&1',
     )
-    self.assertResourceCalled('Execute', "hadoop dfsadmin -fs hdfs://ns1 -safemode get | grep 'Safe mode is OFF'",
-        path = ['/usr/bin'],
-        tries = 40,
-        only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn1 | grep active'",
-        user = 'hdfs',
-        try_sleep = 10,
-    )
     self.assertResourceCalled('HdfsResource', '/tmp',
         security_enabled = False,
         only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn1 | grep active'",
@@ -543,13 +533,6 @@ class TestNamenode(RMFTestCase):
     self.assertResourceCalled('Execute', '/usr/bin/kinit -kt /etc/security/keytabs/hdfs.headless.keytab hdfs',
         user = 'hdfs',
     )
-    self.assertResourceCalled('Execute', "hadoop dfsadmin -fs hdfs://ns1 -safemode get | grep 'Safe mode is OFF'",
-        path = ['/usr/bin'],
-        tries = 40,
-        only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn1 | grep active'",
-        user = 'hdfs',
-        try_sleep = 10,
-    )
     self.assertResourceCalled('HdfsResource', '/tmp',
         security_enabled = True,
         only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn1 | grep active'",
@@ -644,13 +627,6 @@ class TestNamenode(RMFTestCase):
                               environment = {'HADOOP_LIBEXEC_DIR': '/usr/lib/hadoop/libexec'},
                               not_if = 'ls /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid >/dev/null 2>&1 && ps -p `cat /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid` >/dev/null 2>&1',
                               )
-    self.assertResourceCalled('Execute', "hadoop dfsadmin -fs hdfs://ns1 -safemode get | grep 'Safe mode is OFF'",
-                              path = ['/usr/bin'],
-                              tries = 40,
-                              only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn1 | grep active'",
-                              user = 'hdfs',
-                              try_sleep = 10,
-                              )
     self.assertResourceCalled('HdfsResource', '/tmp',
         security_enabled = False,
         only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn1 | grep active'",
@@ -742,13 +718,7 @@ class TestNamenode(RMFTestCase):
                               environment = {'HADOOP_LIBEXEC_DIR': '/usr/lib/hadoop/libexec'},
                               not_if = 'ls /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid >/dev/null 2>&1 && ps -p `cat /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid` >/dev/null 2>&1',
                               )
-    self.assertResourceCalled('Execute', "hadoop dfsadmin -fs hdfs://ns1 -safemode get | grep 'Safe mode is OFF'",
-                              path = ['/usr/bin'],
-                              tries = 40,
-                              only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn2 | grep active'",
-                              user = 'hdfs',
-                              try_sleep = 10,
-                              )
+
     self.assertResourceCalled('HdfsResource', '/tmp',
         security_enabled = False,
         only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn2 | grep active'",
@@ -1300,7 +1270,7 @@ class TestNamenode(RMFTestCase):
                               )
     self.assertNoMoreResources()
 
-  @patch("resource_management.core.shell.call")
+  @patch.object(shell, "call")
   def test_pre_rolling_restart_21_and_lower_params(self, call_mock):
     config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/nn_ru_lzo.json"
     with open(config_file, "r") as f:
@@ -1323,7 +1293,7 @@ class TestNamenode(RMFTestCase):
     self.assertEquals("/usr/bin", sys.modules["params"].hadoop_bin_dir)
     self.assertEquals("/usr/lib/hadoop/sbin", sys.modules["params"].hadoop_bin)
 
-  @patch("resource_management.core.shell.call")
+  @patch.object(shell, "call")
   def test_pre_rolling_restart_22_params(self, call_mock):
     config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/nn_ru_lzo.json"
     with open(config_file, "r") as f:
@@ -1348,7 +1318,7 @@ class TestNamenode(RMFTestCase):
     self.assertEquals("/usr/hdp/current/hadoop-client/bin", sys.modules["params"].hadoop_bin_dir)
     self.assertEquals("/usr/hdp/current/hadoop-client/sbin", sys.modules["params"].hadoop_bin)
 
-  @patch("resource_management.core.shell.call")
+  @patch.object(shell, "call")
   def test_pre_rolling_restart_23_params(self, call_mock):
     import itertools