فهرست منبع

AMBARI-17339. Ambari alert "NameNode Last Checkpoint" failing when NameNode is HA (aonishuk)

Andrew Onishuk 9 سال پیش
والد
کامیت
2cbd217c79

+ 47 - 1
ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py

@@ -33,6 +33,10 @@ __all__ = ["get_namenode_states", "get_active_namenode",
 HDFS_NN_STATE_ACTIVE = 'active'
 HDFS_NN_STATE_STANDBY = 'standby'
 
+# hdfs-site keys that hold the NameNode web address when HA is not enabled
+NAMENODE_HTTP_NON_HA = 'dfs.namenode.http-address'
+NAMENODE_HTTPS_NON_HA = 'dfs.namenode.https-address'
+# hdfs-site key whose value selects the HTTPS address keys when it equals 'HTTPS_ONLY'
+DFS_HTTP_POLICY = "dfs.http.policy"
+
 NAMENODE_HTTP_FRAGMENT = 'dfs.namenode.http-address.{0}.{1}'
 NAMENODE_HTTPS_FRAGMENT = 'dfs.namenode.https-address.{0}.{1}'
 NAMENODE_RPC_FRAGMENT = 'dfs.namenode.rpc-address.{0}.{1}'
@@ -121,7 +125,7 @@ def get_namenode_states_noretries(hdfs_site, security_enabled, run_user):
 def is_ha_enabled(hdfs_site):
   dfs_ha_nameservices = get_nameservice(hdfs_site)
   
-  if is_empty(dfs_ha_nameservices):
+  if not dfs_ha_nameservices or is_empty(dfs_ha_nameservices):
     return False
   
   dfs_ha_namenode_ids = hdfs_site[format("dfs.ha.namenodes.{dfs_ha_nameservices}")]
@@ -174,6 +178,48 @@ def get_property_for_active_namenode(hdfs_site, property_name, security_enabled,
 
   return value
 
+def get_all_namenode_addresses(hdfs_site):
+  """
+  - In non-ha mode it will return list of hdfs_site[dfs.namenode.http[s]-address]
+  - In ha-mode it will return list of hdfs_site[dfs.namenode.http-address.NS.Uid], where NS is the name of HA, and Uid is id of NameNode
+  """
+  nn_addresses = []
+  http_policy = 'HTTP_ONLY'
+
+  if DFS_HTTP_POLICY in hdfs_site:
+    http_policy = hdfs_site[DFS_HTTP_POLICY]
+
+  if is_ha_enabled(hdfs_site):
+    name_service = get_nameservice(hdfs_site)
+    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
+    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
+    for nn_unique_id in nn_unique_ids:
+      rpc_key = NAMENODE_RPC_FRAGMENT.format(name_service,nn_unique_id)
+      if http_policy == 'HTTPS_ONLY':
+        key = NAMENODE_HTTPS_FRAGMENT.format(name_service,nn_unique_id)
+      else:
+        key = NAMENODE_HTTP_FRAGMENT.format(name_service,nn_unique_id)
+      if key in hdfs_site:
+        # use str() to ensure that unicode strings do not have the u' in them
+        value = str(hdfs_site[key])
+        if INADDR_ANY in value and rpc_key in hdfs_site:
+          rpc_value = str(hdfs_site[rpc_key])
+          if INADDR_ANY not in rpc_value:
+            rpc_host = rpc_value.split(":")[0]
+            value = value.replace(INADDR_ANY, rpc_host)
+
+        if not value in nn_addresses:
+          nn_addresses.append(value)
+  else:
+    if http_policy == 'HTTPS_ONLY':
+      if NAMENODE_HTTPS_NON_HA in hdfs_site:
+        nn_addresses.append(hdfs_site[NAMENODE_HTTPS_NON_HA])
+    else:
+      if NAMENODE_HTTP_NON_HA in hdfs_site:
+        nn_addresses.append(hdfs_site[NAMENODE_HTTP_NON_HA])
+
+  return nn_addresses
+
 def get_nameservice(hdfs_site):
   """
   Multiple nameservices can be configured for example to support seamless distcp

+ 21 - 13
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py

@@ -24,12 +24,17 @@ import ambari_simplejson as json # simplejson is much faster comparing to Python
 import logging
 import traceback
 
+from resource_management.libraries.functions.namenode_ha_utils import get_all_namenode_addresses
 from resource_management.libraries.functions.curl_krb_request import curl_krb_request
 from resource_management.libraries.functions.curl_krb_request import DEFAULT_KERBEROS_KINIT_TIMER_MS
 from resource_management.libraries.functions.curl_krb_request import KERBEROS_KINIT_TIMER_PARAMETER
 from resource_management.core.environment import Environment
 
 LABEL = 'Last Checkpoint: [{h} hours, {m} minutes, {tx} transactions]'
+HDFS_SITE_KEY = '{{hdfs-site}}'
+
+RESULT_STATE_UNKNOWN = 'UNKNOWN'
+RESULT_STATE_SKIPPED = 'SKIPPED'
 
 NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
 NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
@@ -68,7 +73,7 @@ def get_tokens():
   Returns a tuple of tokens in the format {{site/property}} that will be used
   to build the dictionary passed into execute
   """
-  return (NN_HTTP_ADDRESS_KEY, NN_HTTPS_ADDRESS_KEY, NN_HTTP_POLICY_KEY, EXECUTABLE_SEARCH_PATHS,
+  return (HDFS_SITE_KEY, NN_HTTP_ADDRESS_KEY, NN_HTTPS_ADDRESS_KEY, NN_HTTP_POLICY_KEY, EXECUTABLE_SEARCH_PATHS,
       NN_CHECKPOINT_TX_KEY, NN_CHECKPOINT_PERIOD_KEY, KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY, SMOKEUSER_KEY)
   
 
@@ -92,12 +97,10 @@ def execute(configurations={}, parameters={}, host_name=None):
   http_policy = 'HTTP_ONLY'
   checkpoint_tx = CHECKPOINT_TX_DEFAULT
   checkpoint_period = CHECKPOINT_PERIOD_DEFAULT
-  
-  if NN_HTTP_ADDRESS_KEY in configurations:
-    http_uri = configurations[NN_HTTP_ADDRESS_KEY]
 
-  if NN_HTTPS_ADDRESS_KEY in configurations:
-    https_uri = configurations[NN_HTTPS_ADDRESS_KEY]
+  # hdfs-site is required
+  if not HDFS_SITE_KEY in configurations:
+    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])
 
   if NN_HTTP_POLICY_KEY in configurations:
     http_policy = configurations[NN_HTTP_POLICY_KEY]
@@ -152,13 +155,18 @@ def execute(configurations={}, parameters={}, host_name=None):
   kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)
 
   # determine the right URI and whether to use SSL
-  uri = http_uri
-  if http_policy == 'HTTPS_ONLY':
-    scheme = 'https'
-    
-    if https_uri is not None:
-      uri = https_uri 
-  
+  hdfs_site = configurations[HDFS_SITE_KEY]
+
+  scheme = "https" if http_policy == "HTTPS_ONLY" else "http"
+
+  nn_addresses = get_all_namenode_addresses(hdfs_site)
+  for nn_address in nn_addresses:
+    if nn_address.startswith(host_name + ":"):
+      uri = nn_address
+      break
+  if not uri:
+    return (RESULT_STATE_SKIPPED, ['NameNode on host {0} not found (namenode adresses = {1})'.format(host_name, ', '.join(nn_addresses))])
+
   current_time = int(round(time.time() * 1000))
 
   last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme,uri)

+ 6 - 2
ambari-server/src/test/python/stacks/2.0.6/HDFS/test_alert_checkpoint_time.py

@@ -53,7 +53,11 @@ class TestAlertCheckpointTime(RMFTestCase):
 
     import alert_checkpoint_time as alert
     global configs
-    configs = {
+    configs = { "{{hdfs-site}}" : {'dfs.namenode.http-address' : 'c6401.ambari.apache.org:50470',
+                                   'dfs.http.policy': 'HTTP_ONLY',
+                                   'dfs.namenode.checkpoint.period': 100,
+                                   'security_enabled': 'false',
+                                   'dfs.namenode.checkpoint.txns': 100},
       '{{hdfs-site/dfs.namenode.http-address}}': 'c6401.ambari.apache.org:50470',
       '{{hdfs-site/dfs.http.policy}}': 'HTTP_ONLY',
       '{{hdfs-site/dfs.namenode.checkpoint.period}}': 100,
@@ -83,7 +87,7 @@ class TestAlertCheckpointTime(RMFTestCase):
     response.read.return_value = json.dumps(jmx_output)
     urlopen_mock.return_value = response
 
-    [status, messages] = alert.execute(configurations=configs, parameters=parameters)
+    [status, messages] = alert.execute(configurations=configs, parameters=parameters, host_name="c6401.ambari.apache.org")
 
     self.assertEqual(status, RESULT_STATE_CRITICAL)
     self.assertEqual(messages[0], 'Last Checkpoint: [1 hours, 0 minutes, 1000 transactions]')