před 10 roky · cb0ef79427
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py
@@ -73,7 +73,8 @@ def namenode(action=None, do_format=True, rolling_restart=False, env=None):
 
				     options = "-rollingUpgrade started" if rolling_restart else ""
			
 
				 
			
 
				     if rolling_restart:
			
 
				-      # Must start Zookeeper Failover Controller if it exists on this host because it could have been killed in order to initiate the failover.
			
 
				+      # Most likely, ZKFC is up since RU will initiate the failover command. However, if that failed, RU would have tried
			
 
				+      # to kill ZKFC manually, so we need to start it if not already running.
			
 
				       safe_zkfc_op(action, env)
			
 
				 
			
 
				     service(
			
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py
@@ -46,7 +46,7 @@ import namenode_upgrade
 
				 from hdfs_namenode import namenode
			
 
				 from hdfs import hdfs
			
 
				 import hdfs_rebalance
			
 
				-from utils import stop_zkfc_during_ru
			
 
				+from utils import initiate_safe_zkfc_failover
			
 
				 
			
 
				 
			
 
				 # hashlib is supplied as of Python 2.5 as the replacement interface for md5
			
@@ -86,7 +86,7 @@ class NameNode(Script):
 
				     env.set_params(params)
			
 
				     if rolling_restart and params.dfs_ha_enabled:
			
 
				       if params.dfs_ha_automatic_failover_enabled:
			
 
				-        stop_zkfc_during_ru()
			
 
				+        initiate_safe_zkfc_failover()
			
 
				       else:
			
 
				         raise Fail("Rolling Upgrade - dfs.ha.automatic-failover.enabled must be enabled to perform a rolling restart")
			
 
				     namenode(action="stop", rolling_restart=rolling_restart, env=env)
			
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py
@@ -245,9 +245,13 @@ dfs_ha_namenode_active = default("/configurations/hadoop-env/dfs_ha_initial_name
 
				 # hostname of the standby HDFS HA Namenode (only used when HA is enabled)
			
 
				 dfs_ha_namenode_standby = default("/configurations/hadoop-env/dfs_ha_initial_namenode_standby", None)
			
 
				 
			
 
				+# Values for the current Host
			
 
				 namenode_id = None
			
 
				 namenode_rpc = None
			
 
				 
			
 
				+dfs_ha_namemodes_ids_list = []
			
 
				+other_namenode_id = None
			
 
				+
			
 
				 if dfs_ha_namenode_ids:
			
 
				   dfs_ha_namemodes_ids_list = dfs_ha_namenode_ids.split(",")
			
 
				   dfs_ha_namenode_ids_array_len = len(dfs_ha_namemodes_ids_list)
			
@@ -262,6 +266,11 @@ if dfs_ha_enabled:
 
				   # With HA enabled namenode_address is recomputed
			
 
				   namenode_address = format('hdfs://{dfs_ha_nameservices}')
			
 
				 
			
 
				+  # Calculate the namenode id of the other namenode. This is needed during RU to initiate an HA failover using ZKFC.
			
 
				+  if namenode_id is not None and len(dfs_ha_namemodes_ids_list) == 2:
			
 
				+    other_namenode_id = list(set(dfs_ha_namemodes_ids_list) - set([namenode_id]))[0]
			
 
				+
			
 
				+
			
 
				 if dfs_http_policy is not None and dfs_http_policy.upper() == "HTTPS_ONLY":
			
 
				   https_only = True
			
 
				   journalnode_address = default('/configurations/hdfs-site/dfs.journalnode.https-address', None)
			
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/utils.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/utils.py
@@ -60,19 +60,19 @@ def safe_zkfc_op(action, env):
 
				       if zkfc:
			
 
				         zkfc.stop(env)
			
 
				 
			
 
				-
			
 
				-def stop_zkfc_during_ru():
			
 
				+def initiate_safe_zkfc_failover():
			
 
				   """
			
 
				-  Restart ZKFC on either the standby or active Namenode. If done on the currently active namenode,
			
 
				-  wait for it to become the standby.
			
 
				-  This will run a kinit before executing the 'hdfs haadmin' command.
			
 
				+  If this is the active namenode, initiate a safe failover and wait for it to become the standby.
			
 
				+
			
 
				+  If an error occurs, force a failover to happen by killing zkfc on this host. In this case, during the Restart,
			
 
				+  ZKFC will also have to be started manually.
			
 
				   """
			
 
				   import params
			
 
				 
			
 
				-  # must kinit before running the HDFS command
			
 
				+  # Must kinit before running the HDFS command
			
 
				   if params.security_enabled:
			
 
				-      Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
			
 
				-        user = params.hdfs_user)
			
 
				+    Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
			
 
				+            user = params.hdfs_user)
			
 
				 
			
 
				   check_service_cmd = format("hdfs haadmin -getServiceState {namenode_id}")
			
 
				   code, out = shell.call(check_service_cmd, logoutput=True, user=params.hdfs_user)
			
@@ -82,34 +82,39 @@ def stop_zkfc_during_ru():
 
				     original_state = "active" if "active" in out else ("standby" if "standby" in out else original_state)
			
 
				     Logger.info("Namenode service state: %s" % original_state)
			
 
				 
			
 
				-    msg = "Rolling Upgrade - Killing ZKFC on {0} NameNode host {1} {2}"\
			
 
				-      .format(original_state, params.hostname, "to initiate a failover" if original_state == "active" else "")
			
 
				-    Logger.info(msg)
			
 
				-
			
 
				-    # Forcefully kill ZKFC. If this is the active, will initiate a failover.
			
 
				-    # If ZKFC is already dead, then potentially this node can still be the active one.
			
 
				-    was_zkfc_killed = kill_zkfc(params.hdfs_user)
			
 
				-
			
 
				-    # Wait until it transitions to standby
			
 
				-    check_standby_cmd = format("hdfs haadmin -getServiceState {namenode_id} | grep standby")
			
 
				+    if original_state == "active":
			
 
				+      msg = "Rolling Upgrade - Initiating a ZKFC failover on {0} NameNode host {1}.".format(original_state, params.hostname)
			
 
				+      Logger.info(msg)
			
 
				 
			
 
				-    # process may already be down.  try one time, then proceed
			
 
				+      check_standby_cmd = format("hdfs haadmin -getServiceState {namenode_id} | grep standby")
			
 
				+      failover_command = format("hdfs haadmin -failover {namenode_id} {other_namenode_id}")
			
 
				 
			
 
				-    if original_state == "active":
			
 
				-      code, out = shell.call(check_standby_cmd, user=params.hdfs_user, logoutput=True)
			
 
				-      Logger.info(format("Rolling Upgrade - check for standby returned {code}"))
			
 
				+      code, out = shell.call(failover_command, user=params.hdfs_user, logoutput=True)
			
 
				+      Logger.info(format("Rolling Upgrade - failover command returned {code}"))
			
 
				+      wait_for_standby = False
			
 
				 
			
 
				-      if code == 255 and out:
			
 
				-        Logger.info("Rolling Upgrade - namenode is already down.")
			
 
				+      if code == 0:
			
 
				+        wait_for_standby = True
			
 
				       else:
			
 
				-        if was_zkfc_killed:
			
 
				-          # Only mandate that this be the standby namenode if ZKFC was indeed killed to initiate a failover.
			
 
				-          Logger.info("Waiting for this NameNode to become the standby one.")
			
 
				-          Execute(check_standby_cmd,
			
 
				-                  user=params.hdfs_user,
			
 
				-                  tries=50,
			
 
				-                  try_sleep=6,
			
 
				-                  logoutput=True)
			
 
				+        # Try to kill ZKFC manually
			
 
				+        was_zkfc_killed = kill_zkfc(params.hdfs_user)
			
 
				+        code, out = shell.call(check_standby_cmd, user=params.hdfs_user, logoutput=True)
			
 
				+        Logger.info(format("Rolling Upgrade - check for standby returned {code}"))
			
 
				+
			
 
				+        if code == 255 and out:
			
 
				+          Logger.info("Rolling Upgrade - namenode is already down.")
			
 
				+        else:
			
 
				+          if was_zkfc_killed:
			
 
				+            # Only mandate that this be the standby namenode if ZKFC was indeed killed to initiate a failover.
			
 
				+            wait_for_standby = True
			
 
				+
			
 
				+      if wait_for_standby:
			
 
				+        Logger.info("Waiting for this NameNode to become the standby one.")
			
 
				+        Execute(check_standby_cmd,
			
 
				+                user=params.hdfs_user,
			
 
				+                tries=50,
			
 
				+                try_sleep=6,
			
 
				+                logoutput=True)
			
 
				   else:
			
 
				     raise Fail("Unable to determine NameNode HA states by calling command: {0}".format(check_service_cmd))
			
 
				 
			
--- a/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/upgrade-2.3.xml
+++ b/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/upgrade-2.3.xml
@@ -83,8 +83,8 @@
 
				       <service-check>false</service-check>
			
 
				       <service name="HDFS">
			
 
				         <component>JOURNALNODE</component>
			
 
				-        <component>NAMENODE</component>
			
 
				         <component>ZKFC</component>
			
 
				+        <component>NAMENODE</component>
			
 
				       </service>
			
 
				 
			
 
				       <service name="MAPREDUCE2">
			
--- a/ambari-server/src/main/resources/stacks/HDP/2.3/upgrades/upgrade-2.3.xml
+++ b/ambari-server/src/main/resources/stacks/HDP/2.3/upgrades/upgrade-2.3.xml
@@ -99,6 +99,7 @@
 
				       <service-check>false</service-check>
			
 
				       <service name="HDFS">
			
 
				         <component>JOURNALNODE</component>
			
 
				+        <component>ZKFC</component>
			
 
				         <component>NAMENODE</component>
			
 
				       </service>
			
 
				 
			
@@ -406,6 +407,12 @@
 
				           <task xsi:type="restart" />
			
 
				         </upgrade>
			
 
				       </component>
			
 
				+
			
 
				+      <component name="ZKFC">
			
 
				+        <upgrade>
			
 
				+          <task xsi:type="restart" />
			
 
				+        </upgrade>
			
 
				+      </component>
			
 
				     </service>
			
 
				 
			
 
				     <service name="MAPREDUCE2">