Explorar o código

AMBARI-11372. Namenode HA monitoring broken on cluster secured with HTTP SPNEGO (2.1).(vbrodetskyi)

Vitaly Brodetskyi %!s(int64=10) %!d(string=hai) anos
pai
achega
1d1e219c7e

+ 1 - 1
ambari-agent/src/main/python/ambari_agent/AlertSchedulerHandler.py

@@ -270,7 +270,7 @@ class AlertSchedulerHandler():
     alert = None
 
     if source_type == AlertSchedulerHandler.TYPE_METRIC:
-      alert = MetricAlert(json_definition, source)
+      alert = MetricAlert(json_definition, source, self.config)
     elif source_type == AlertSchedulerHandler.TYPE_PORT:
       alert = PortAlert(json_definition, source)
     elif source_type == AlertSchedulerHandler.TYPE_SCRIPT:

+ 78 - 20
ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py

@@ -25,18 +25,22 @@ import re
 import urllib2
 import uuid
 
+from  tempfile import gettempdir
 from alerts.base_alert import BaseAlert
 from ambari_commons.urllib_handlers import RefreshHeaderProcessor
 from resource_management.libraries.functions.get_port_from_url import get_port_from_url
+from resource_management.libraries.functions.curl_krb_request import curl_krb_request
 
 logger = logging.getLogger()
 
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+
 # default timeout
 DEFAULT_CONNECTION_TIMEOUT = 5.0
 
 class MetricAlert(BaseAlert):
   
-  def __init__(self, alert_meta, alert_source_meta):
+  def __init__(self, alert_meta, alert_source_meta, config):
     super(MetricAlert, self).__init__(alert_meta, alert_source_meta)
 
     connection_timeout = DEFAULT_CONNECTION_TIMEOUT
@@ -57,6 +61,7 @@ class MetricAlert(BaseAlert):
     # python uses 5.0, not 5
     self.connection_timeout = float(connection_timeout)
 
+    self.config = config
 
   def _collect(self):
     if self.metric_info is None:
@@ -85,14 +90,20 @@ class MetricAlert(BaseAlert):
     value_list = []
 
     if isinstance(self.metric_info, JmxMetric):
-      value_list.extend(self._load_jmx(alert_uri.is_ssl_enabled, host, port, self.metric_info))
-      check_value = self.metric_info.calculate(value_list)
-      value_list.append(check_value)
+      jmx_property_values, http_code = self._load_jmx(alert_uri.is_ssl_enabled, host, port, self.metric_info)
+      if not jmx_property_values and http_code in [200, 307]:
+        collect_result = self.RESULT_UNKNOWN
+        value_list.append('HTTP {0} response (metrics unavailable)'.format(str(http_code)))
+      elif not jmx_property_values and http_code not in [200, 307]:
+        raise Exception("[Alert][{0}] Unable to get json from jmx response!".format(self.get_name()))
+      else:
+        value_list.extend(jmx_property_values)
+        check_value = self.metric_info.calculate(value_list)
+        value_list.append(check_value)
       
-      collect_result = self.__get_result(value_list[0] if check_value is None else check_value)
+        collect_result = self.__get_result(value_list[0] if check_value is None else check_value)
 
-    logger.debug("[Alert][{0}] Resolved values = {1}".format(
-      self.get_name(), str(value_list)))
+        logger.debug("[Alert][{0}] Resolved values = {1}".format(self.get_name(), str(value_list)))
     
     return (collect_result, value_list)
 
@@ -155,10 +166,28 @@ class MetricAlert(BaseAlert):
   def _load_jmx(self, ssl, host, port, jmx_metric):
     """ creates a JmxMetric object that holds info about jmx-based metrics """
     value_list = []
+    kerberos_keytab = None
+    kerberos_principal = None
 
     if logger.isEnabledFor(logging.DEBUG):
       logger.debug(str(jmx_metric.property_map))
 
+    security_enabled = str(self._get_configuration_value(SECURITY_ENABLED_KEY)).upper() == 'TRUE'
+
+    if self.uri_property_keys.kerberos_principal is not None:
+      kerberos_principal = self._get_configuration_value(
+      self.uri_property_keys.kerberos_principal)
+
+      if kerberos_principal is not None:
+        # substitute _HOST in kerberos principal with actual fqdn
+        kerberos_principal = kerberos_principal.replace('_HOST', self.host_name)
+
+    if self.uri_property_keys.kerberos_keytab is not None:
+      kerberos_keytab = self._get_configuration_value(self.uri_property_keys.kerberos_keytab)
+
+    if "0.0.0.0" in str(host):
+      host = self.host_name
+
     for jmx_property_key, jmx_property_value in jmx_metric.property_map.iteritems():
       url = "{0}://{1}:{2}/jmx?qry={3}".format(
         "https" if ssl else "http", host, str(port), jmx_property_key)
@@ -166,10 +195,25 @@ class MetricAlert(BaseAlert):
       # use a customer header processor that will look for the non-standard
       # "Refresh" header and attempt to follow the redirect
       response = None
+      content = ''
       try:
-        url_opener = urllib2.build_opener(RefreshHeaderProcessor())
-        response = url_opener.open(url, timeout=self.connection_timeout)
-        content = response.read()
+        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
+          tmp_dir = self.config.get('agent', 'tmp_dir')
+          if tmp_dir is None:
+            tmp_dir = gettempdir()
+
+          kerberos_executable_search_paths = self._get_configuration_value('{{kerberos-env/executable_search_paths}}')
+
+          response, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal, url,
+                                          "metric_alert", kerberos_executable_search_paths, False, self.get_name())
+          content = response
+        else:
+          url_opener = urllib2.build_opener(RefreshHeaderProcessor())
+          response = url_opener.open(url, timeout=self.connection_timeout)
+          content = response.read()
+      except Exception, exception:
+        if logger.isEnabledFor(logging.DEBUG):
+          logger.exception("[Alert][{0}] Unable to make a web request: {1}".format(self.get_name(), str(exception)))
       finally:
         # explicitely close the connection as we've seen python hold onto these
         if response is not None:
@@ -179,16 +223,30 @@ class MetricAlert(BaseAlert):
             logger.debug("[Alert][{0}] Unable to close JMX URL connection to {1}".format
               (self.get_name(), url))
 
-      json_response = json.loads(content)
-      json_data = json_response['beans'][0]
-      
-      for attr in jmx_property_value:
-        if attr not in json_data:
-          raise Exception("Unable to find {0} in JSON from {1} ".format(attr, url))
-
-        value_list.append(json_data[attr])
-        
-    return value_list
+      json_is_valid = True
+      try:
+        json_response = json.loads(content)
+        json_data = json_response['beans'][0]
+      except Exception, exception:
+        json_is_valid = False
+        if logger.isEnabledFor(logging.DEBUG):
+          logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
+                         format(self.get_name(), str(exception)))
+
+      if json_is_valid:
+        for attr in jmx_property_value:
+          if attr not in json_data:
+            raise Exception("Unable to find {0} in JSON from {1} ".format(attr, url))
+
+          value_list.append(json_data[attr])
+
+      http_response_code = None
+      if not json_is_valid and security_enabled and \
+            kerberos_principal is not None and kerberos_keytab is not None:
+        http_response_code, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal, url,
+                                              "metric_alert", kerberos_executable_search_paths, True, self.get_name())
+
+    return (value_list, http_response_code)
 
   def _get_reporting_text(self, state):
     '''

+ 3 - 55
ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py

@@ -20,18 +20,15 @@ limitations under the License.
 
 import logging
 import time
-import subprocess
 import os
 import urllib2
 from urllib2 import HTTPError
-import uuid
 
 from  tempfile import gettempdir
 from alerts.base_alert import BaseAlert
 from collections import namedtuple
 from resource_management.libraries.functions.get_port_from_url import get_port_from_url
-from resource_management.libraries.functions import get_kinit_path
-from resource_management.libraries.functions import get_klist_path
+from resource_management.libraries.functions.curl_krb_request import curl_krb_request
 from ambari_commons import OSCheck
 from ambari_commons.inet_utils import resolve_address
 
@@ -175,60 +172,11 @@ class WebAlert(BaseAlert):
         if tmp_dir is None:
           tmp_dir = gettempdir()
 
-        ccache_file_name = _md5("{0}|{1}".format(kerberos_principal, kerberos_keytab)).hexdigest()
-        ccache_file_path = "{0}{1}web_alert_cc_{2}".format(tmp_dir, os.sep, ccache_file_name)
-        kerberos_env = {'KRB5CCNAME': ccache_file_path}
-
         # Get the configured Kerberos executables search paths, if any
         kerberos_executable_search_paths = self._get_configuration_value('{{kerberos-env/executable_search_paths}}')
 
-        # If there are no tickets in the cache or they are expired, perform a kinit, else use what
-        # is in the cache
-        klist_path_local = get_klist_path(kerberos_executable_search_paths)
-
-        if os.system("{0} -s {1}".format(klist_path_local, ccache_file_path)) != 0:
-          kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
-          logger.debug("[Alert][{0}] Enabling Kerberos authentication via GSSAPI using ccache at {1}.".format(
-            self.get_name(), ccache_file_path))
-
-          os.system("{0} -l 5m -c {1} -kt {2} {3} > /dev/null".format(
-            kinit_path_local, ccache_file_path, kerberos_keytab,
-            kerberos_principal))
-        else:
-          logger.debug("[Alert][{0}] Kerberos authentication via GSSAPI already enabled using ccache at {1}.".format(
-            self.get_name(), ccache_file_path))
-
-        # check if cookies dir exists, if not then create it
-        tmp_dir = self.config.get('agent', 'tmp_dir')
-        cookies_dir = os.path.join(tmp_dir, "cookies")
-
-        if not os.path.exists(cookies_dir):
-          os.makedirs(cookies_dir)
-
-        cookie_file_name = str(uuid.uuid4())
-        cookie_file = os.path.join(cookies_dir, cookie_file_name)
-
-        start_time = time.time()
-
-        try:
-          curl = subprocess.Popen(['curl', '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file, '-sL', '-w',
-            '%{http_code}', url, '--connect-timeout', self.curl_connection_timeout,
-            '-o', '/dev/null'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)
-
-          curl_stdout, curl_stderr = curl.communicate()
-        finally:
-          if os.path.isfile(cookie_file):
-            os.remove(cookie_file)
-
-        # empty quotes evaluates to false
-        if curl_stderr:
-          error_msg = curl_stderr
-
-        # empty quotes evaluates to false
-        if curl_stdout:
-          response_code = int(curl_stdout)
-
-        time_millis = time.time() - start_time
+        response_code, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal, url,
+                                               "web_alert", kerberos_executable_search_paths, True, self.get_name())
       else:
         # kerberos is not involved; use urllib2
         response_code, time_millis, error_msg = self._make_web_request_urllib(url)

+ 15 - 15
ambari-agent/src/test/python/ambari_agent/TestAlerts.py

@@ -255,12 +255,12 @@ class TestAlerts(TestCase):
     cluster_configuration = self.__get_cluster_configuration()
     self.__update_cluster_configuration(cluster_configuration, configuration)
 
-    alert = MetricAlert(definition_json, definition_json['source'])
+    alert = MetricAlert(definition_json, definition_json['source'], None)
     alert.set_helpers(collector, cluster_configuration)
     alert.set_cluster("c1", "c6401.ambari.apache.org")
 
     # trip an OK
-    ma_load_jmx_mock.return_value = [1, 25]
+    ma_load_jmx_mock.return_value = ([1, 25], None)
 
     alert.collect()
     alerts = collector.alerts()
@@ -269,7 +269,7 @@ class TestAlerts(TestCase):
     self.assertEquals('(Unit Tests) OK: 1 25 125', alerts[0]['text'])
 
     # trip a warning
-    ma_load_jmx_mock.return_value = [1, 75]
+    ma_load_jmx_mock.return_value = ([1, 75], None)
 
     alert.collect()
     alerts = collector.alerts()
@@ -278,7 +278,7 @@ class TestAlerts(TestCase):
     self.assertEquals('(Unit Tests) Warning: 1 75 175', alerts[0]['text'])
 
     # trip a critical now
-    ma_load_jmx_mock.return_value = [1, 150]
+    ma_load_jmx_mock.return_value = ([1, 150], None)
 
     alert.collect()
     alerts = collector.alerts()
@@ -289,12 +289,12 @@ class TestAlerts(TestCase):
     del definition_json['source']['jmx']['value']
     collector = AlertCollector()
 
-    alert = MetricAlert(definition_json, definition_json['source'])
+    alert = MetricAlert(definition_json, definition_json['source'], None)
     alert.set_helpers(collector, cluster_configuration)
     alert.set_cluster("c1", "c6401.ambari.apache.org")
 
     # now try without any jmx value to compare to
-    ma_load_jmx_mock.return_value = [1, 25]
+    ma_load_jmx_mock.return_value = ([1, 25], None)
 
     alert.collect()
     alerts = collector.alerts()
@@ -307,13 +307,13 @@ class TestAlerts(TestCase):
   def test_alert_uri_structure(self, ma_load_jmx_mock):
     definition_json = self._get_metric_alert_definition()
 
-    ma_load_jmx_mock.return_value = [0,0]
+    ma_load_jmx_mock.return_value = ([0,0], None)
     
     # run the alert without specifying any keys; an exception should be thrown
     # indicating that there was no URI and the result is UNKNOWN
     collector = AlertCollector()
     cluster_configuration = self.__get_cluster_configuration()
-    alert = MetricAlert(definition_json, definition_json['source'])
+    alert = MetricAlert(definition_json, definition_json['source'], None)
     alert.set_helpers(collector, cluster_configuration)
     alert.set_cluster("c1", "c6401.ambari.apache.org")
     alert.collect()
@@ -329,7 +329,7 @@ class TestAlerts(TestCase):
     cluster_configuration = self.__get_cluster_configuration()
     self.__update_cluster_configuration(cluster_configuration, configuration)
 
-    alert = MetricAlert(definition_json, definition_json['source'])
+    alert = MetricAlert(definition_json, definition_json['source'], None)
     alert.set_helpers(collector, cluster_configuration)
     alert.set_cluster("c1", "c6401.ambari.apache.org")
     alert.collect()
@@ -345,7 +345,7 @@ class TestAlerts(TestCase):
     self.__update_cluster_configuration(cluster_configuration, configuration)
 
     collector = AlertCollector()
-    alert = MetricAlert(definition_json, definition_json['source'])
+    alert = MetricAlert(definition_json, definition_json['source'], None)
     alert.set_helpers(collector, cluster_configuration)
     alert.set_cluster("c1", "c6401.ambari.apache.org")
     alert.collect()
@@ -361,7 +361,7 @@ class TestAlerts(TestCase):
     self.__update_cluster_configuration(cluster_configuration, configuration)
 
     collector = AlertCollector()
-    alert = MetricAlert(definition_json, definition_json['source'])
+    alert = MetricAlert(definition_json, definition_json['source'], None)
     alert.set_helpers(collector, cluster_configuration)
     alert.set_cluster("c1", "c6401.ambari.apache.org")
     alert.collect()
@@ -378,7 +378,7 @@ class TestAlerts(TestCase):
     self.__update_cluster_configuration(cluster_configuration, configuration)
 
     collector = AlertCollector()
-    alert = MetricAlert(definition_json, definition_json['source'])
+    alert = MetricAlert(definition_json, definition_json['source'], None)
     alert.set_helpers(collector, cluster_configuration)
     alert.set_cluster("c1", "c6401.ambari.apache.org")
     alert.collect()
@@ -628,7 +628,7 @@ class TestAlerts(TestCase):
     self.assertEquals(alert._get_reporting_text(alert.RESULT_CRITICAL), 'Connection failed to {1}')
 
     definition_json['source']['type'] = 'METRIC'
-    alert = MetricAlert(definition_json, definition_json['source'])
+    alert = MetricAlert(definition_json, definition_json['source'], None)
     self.assertEquals(alert._get_reporting_text(alert.RESULT_OK), '{0}')
     self.assertEquals(alert._get_reporting_text(alert.RESULT_WARNING), '{0}')
     self.assertEquals(alert._get_reporting_text(alert.RESULT_CRITICAL), '{0}')
@@ -867,7 +867,7 @@ class TestAlerts(TestCase):
     cluster_configuration = self.__get_cluster_configuration()
     self.__update_cluster_configuration(cluster_configuration, configuration)
 
-    alert = MetricAlert(definition_json, definition_json['source'])
+    alert = MetricAlert(definition_json, definition_json['source'], None)
     alert.set_helpers(collector, cluster_configuration)
     alert.set_cluster("c1", "c6401.ambari.apache.org")
 
@@ -965,7 +965,7 @@ class TestAlerts(TestCase):
 
     # the metric definition will not and should default to 5.0
     definition_json = self._get_metric_alert_definition()
-    alert = MetricAlert(definition_json, definition_json['source'])
+    alert = MetricAlert(definition_json, definition_json['source'], None)
     self.assertEquals(5.0, alert.connection_timeout)
 
 

+ 1 - 1
ambari-agent/src/test/python/ambari_agent/TestMetricAlert.py

@@ -205,7 +205,7 @@ class TestMetricAlert(TestCase):
     mock_collector = MagicMock()
     mock_collector.put = Mock(side_effect=collector_side_effect)
 
-    alert = MetricAlert(alert_meta, alert_source_meta)
+    alert = MetricAlert(alert_meta, alert_source_meta, None)
     alert.set_helpers(mock_collector, {'foo-site/bar': 12, 'foo-site/baz': 'asd'})
     alert.set_cluster(cluster, host)
 

+ 1 - 0
ambari-common/src/main/python/resource_management/libraries/functions/__init__.py

@@ -41,6 +41,7 @@ from resource_management.libraries.functions.constants import *
 from resource_management.libraries.functions.get_hdp_version import *
 from resource_management.libraries.functions.get_lzo_packages import *
 from resource_management.libraries.functions.setup_ranger_plugin import *
+from resource_management.libraries.functions.curl_krb_request import *
 
 IS_WINDOWS = platform.system() == "Windows"
 

+ 123 - 0
ambari-common/src/main/python/resource_management/libraries/functions/curl_krb_request.py

@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Ambari Agent
+
+"""
+
+__all__ = ["curl_krb_request"]
+import logging
+import os
+import uuid
+import time
+import subprocess
+
+from get_kinit_path import get_kinit_path
+from get_klist_path import get_klist_path
+# hashlib is supplied as of Python 2.5 as the replacement interface for md5
+# and other secure hashes.  In 2.6, md5 is deprecated.  Import hashlib if
+# available, avoiding a deprecation warning under 2.6.  Import md5 otherwise,
+# preserving 2.4 compatibility.
+try:
+  import hashlib
+  _md5 = hashlib.md5
+except ImportError:
+  import md5
+  _md5 = md5.new
+
+CONNECTION_TIMEOUT = 10
+
+logger = logging.getLogger()
+
+
+def curl_krb_request(tmp_dir, keytab, principal, url, cache_file_prefix, krb_exec_search_paths,
+                     return_only_http_code, alert_name):
+  # Create the kerberos credentials cache (ccache) file and set it in the environment to use
+  # when executing curl. Use the md5 hash of the combination of the principal and keytab file
+  # to generate a (relatively) unique cache filename so that we can use it as needed.
+  ccache_file_name = _md5("{0}|{1}".format(principal, keytab)).hexdigest()
+  ccache_file_path = "{0}{1}{2}_cc_{3}".format(tmp_dir, os.sep, cache_file_prefix, ccache_file_name)
+  kerberos_env = {'KRB5CCNAME': ccache_file_path}
+
+  # If there are no tickets in the cache or they are expired, perform a kinit, else use what
+  # is in the cache
+  if krb_exec_search_paths:
+    klist_path_local = get_klist_path(krb_exec_search_paths)
+  else:
+    klist_path_local = get_klist_path()
+
+  if os.system("{0} -s {1}".format(klist_path_local, ccache_file_path)) != 0:
+    if krb_exec_search_paths:
+      kinit_path_local = get_kinit_path(krb_exec_search_paths)
+    else:
+      kinit_path_local = get_kinit_path()
+    logger.debug("[Alert][{0}] Enabling Kerberos authentication via GSSAPI using ccache at {1}.".format(
+      alert_name, ccache_file_path))
+
+    os.system("{0} -l 5m -c {1} -kt {2} {3} > /dev/null".format(kinit_path_local, ccache_file_path, keytab, principal))
+  else:
+    logger.debug("[Alert][{0}] Kerberos authentication via GSSAPI already enabled using ccache at {1}.".format(
+      alert_name, ccache_file_path))
+
+  # check if cookies dir exists, if not then create it
+  cookies_dir = os.path.join(tmp_dir, "cookies")
+
+  if not os.path.exists(cookies_dir):
+    os.makedirs(cookies_dir)
+
+  cookie_file_name = str(uuid.uuid4())
+  cookie_file = os.path.join(cookies_dir, cookie_file_name)
+
+  start_time = time.time()
+  error_msg = None
+  try:
+    if return_only_http_code:
+      curl = subprocess.Popen(['curl', '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file, '-w',
+                             '%{http_code}', url, '--connect-timeout', str(CONNECTION_TIMEOUT),'-o', '/dev/null'],
+                             stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)
+    else:
+      # returns response body
+      curl = subprocess.Popen(['curl', '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file,
+                             url, '--connect-timeout', str(CONNECTION_TIMEOUT)],
+                             stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)
+
+    curl_stdout, curl_stderr = curl.communicate()
+  except Exception, exception:
+    if logger.isEnabledFor(logging.DEBUG):
+      logger.exception("[Alert][{0}] Unable to make a web request.".format(alert_name))
+    raise Exception(exception)
+  finally:
+    if os.path.isfile(cookie_file):
+      os.remove(cookie_file)
+
+  # empty quotes evaluates to false
+  if curl_stderr:
+    error_msg = curl_stderr
+
+  time_millis = time.time() - start_time
+
+  # empty quotes evaluates to false
+  if curl_stdout:
+    if return_only_http_code:
+      return (int(curl_stdout), error_msg, time_millis)
+    else:
+      return (curl_stdout, error_msg, time_millis)
+
+  logger.debug("[Alert][{0}] Curl response is empty! Please take a look at error message: ".
+               format(alert_name, str(error_msg)))
+  return ("", error_msg, time_millis)

+ 14 - 0
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json

@@ -127,6 +127,8 @@
           "uri": {
             "http": "{{hdfs-site/dfs.namenode.http-address}}",
             "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
+            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
             "https_property": "{{hdfs-site/dfs.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
             "connection_timeout": 5.0,
@@ -172,6 +174,8 @@
           "uri": {
             "http": "{{hdfs-site/dfs.namenode.http-address}}",
             "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
+            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
             "https_property": "{{hdfs-site/dfs.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
             "connection_timeout": 5.0,
@@ -217,6 +221,8 @@
           "uri": {
             "http": "{{hdfs-site/dfs.namenode.http-address}}",
             "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
+            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
             "https_property": "{{hdfs-site/dfs.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
             "connection_timeout": 5.0,
@@ -262,6 +268,8 @@
           "uri": {
             "http": "{{hdfs-site/dfs.namenode.http-address}}",
             "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
+            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
             "https_property": "{{hdfs-site/dfs.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
             "connection_timeout": 5.0,
@@ -307,6 +315,8 @@
           "uri": {
             "http": "{{hdfs-site/dfs.namenode.http-address}}",
             "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
+            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
             "https_property": "{{hdfs-site/dfs.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
             "connection_timeout": 5.0,
@@ -351,6 +361,8 @@
           "uri": {
             "http": "{{hdfs-site/dfs.namenode.http-address}}",
             "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
+            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
             "https_property": "{{hdfs-site/dfs.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
             "connection_timeout": 5.0,
@@ -605,6 +617,8 @@
           "uri": {
             "http": "{{hdfs-site/dfs.datanode.http.address}}",
             "https": "{{hdfs-site/dfs.datanode.https.address}}",
+            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
+            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
             "https_property": "{{hdfs-site/dfs.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
             "connection_timeout": 5.0

+ 40 - 3
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py

@@ -21,6 +21,10 @@ limitations under the License.
 import time
 import urllib2
 import json
+import logging
+
+from resource_management.libraries.functions.curl_krb_request import curl_krb_request
+from resource_management.core.environment import Environment
 
 LABEL = 'Last Checkpoint: [{h} hours, {m} minutes, {tx} transactions]'
 
@@ -42,13 +46,19 @@ CHECKPOINT_PERIOD_DEFAULT = 21600
 CONNECTION_TIMEOUT_KEY = 'connection.timeout'
 CONNECTION_TIMEOUT_DEFAULT = 5.0
 
+KERBEROS_KEYTAB = '{{hdfs-site/dfs.web.authentication.kerberos.keytab}}'
+KERBEROS_PRINCIPAL = '{{hdfs-site/dfs.web.authentication.kerberos.principal}}'
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+
+logger = logging.getLogger()
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
   to build the dictionary passed into execute
   """
   return (NN_HTTP_ADDRESS_KEY, NN_HTTPS_ADDRESS_KEY, NN_HTTP_POLICY_KEY, 
-      NN_CHECKPOINT_TX_KEY, NN_CHECKPOINT_PERIOD_KEY)      
+      NN_CHECKPOINT_TX_KEY, NN_CHECKPOINT_PERIOD_KEY, KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY)
   
 
 def execute(configurations={}, parameters={}, host_name=None):
@@ -87,6 +97,19 @@ def execute(configurations={}, parameters={}, host_name=None):
   if NN_CHECKPOINT_PERIOD_KEY in configurations:
     checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]
 
+  security_enabled = False
+  if SECURITY_ENABLED_KEY in configurations:
+    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'
+
+  kerberos_keytab = None
+  if KERBEROS_KEYTAB in configurations:
+    kerberos_keytab = configurations[KERBEROS_KEYTAB]
+
+  kerberos_principal = None
+  if KERBEROS_PRINCIPAL in configurations:
+    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
+    kerberos_principal = kerberos_principal.replace('_HOST', host_name)
+
   # parse script arguments
   connection_timeout = CONNECTION_TIMEOUT_DEFAULT
   if CONNECTION_TIMEOUT_KEY in parameters:
@@ -118,10 +141,24 @@ def execute(configurations={}, parameters={}, host_name=None):
   result_code = "OK"
 
   try:
-    last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,
+    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
+      env = Environment.get_instance()
+      last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab,
+                                    kerberos_principal, last_checkpoint_time_qry,"checkpoint_time_alert", None, False,
+                                    "NameNode Last Checkpoint")
+      last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
+      last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])
+
+      journal_transaction_info_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab,
+                                      kerberos_principal, journal_transaction_info_qry,"checkpoint_time_alert", None,
+                                      False, "NameNode Last Checkpoint")
+      journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
+      journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
+    else:
+      last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,
       "LastCheckpointTime", connection_timeout))
 
-    journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,
+      journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,
       "JournalTransactionInfo", connection_timeout)
 
     journal_transaction_info_dict = json.loads(journal_transaction_info)

+ 33 - 2
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py

@@ -20,6 +20,10 @@ limitations under the License.
 
 import urllib2
 import json
+import logging
+
+from resource_management.libraries.functions.curl_krb_request import curl_krb_request
+from resource_management.core.environment import Environment
 
 RESULT_STATE_OK = 'OK'
 RESULT_STATE_CRITICAL = 'CRITICAL'
@@ -35,16 +39,22 @@ NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
 NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
 DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'
 
+KERBEROS_KEYTAB = '{{hdfs-site/dfs.web.authentication.kerberos.keytab}}'
+KERBEROS_PRINCIPAL = '{{hdfs-site/dfs.web.authentication.kerberos.principal}}'
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+
 CONNECTION_TIMEOUT_KEY = 'connection.timeout'
 CONNECTION_TIMEOUT_DEFAULT = 5.0
 
+logger = logging.getLogger()
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
   to build the dictionary passed into execute
   """
   return (HDFS_SITE_KEY, NAMESERVICE_KEY, NN_HTTP_ADDRESS_KEY,
-  NN_HTTPS_ADDRESS_KEY, DFS_POLICY_KEY)
+  NN_HTTPS_ADDRESS_KEY, DFS_POLICY_KEY, KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY)
   
 
 def execute(configurations={}, parameters={}, host_name=None):
@@ -72,6 +82,19 @@ def execute(configurations={}, parameters={}, host_name=None):
   if CONNECTION_TIMEOUT_KEY in parameters:
     connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
 
+  security_enabled = False
+  if SECURITY_ENABLED_KEY in configurations:
+    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'
+
+  kerberos_keytab = None
+  if KERBEROS_KEYTAB in configurations:
+    kerberos_keytab = configurations[KERBEROS_KEYTAB]
+
+  kerberos_principal = None
+  if KERBEROS_PRINCIPAL in configurations:
+    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
+    kerberos_principal = kerberos_principal.replace('_HOST', host_name)
+
 
   # determine whether or not SSL is enabled
   is_ssl_enabled = False
@@ -113,7 +136,15 @@ def execute(configurations={}, parameters={}, host_name=None):
 
       try:
         jmx_uri = jmx_uri_fragment.format(value)
-        state = get_value_from_jmx(jmx_uri, 'State', connection_timeout)
+        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
+          env = Environment.get_instance()
+          state_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
+                                                    jmx_uri,"ha_nn_health", None, False,
+                                                    "NameNode High Availability Health")
+          state_response_json = json.loads(state_response)
+          state = state_response_json["beans"][0]['State']
+        else:
+          state = get_value_from_jmx(jmx_uri, 'State', connection_timeout)
 
         if state == HDFS_NN_STATE_ACTIVE:
           active_namenodes.append(value)

+ 8 - 0
ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/alerts.json

@@ -43,6 +43,8 @@
           "type": "METRIC",
           "uri": {
             "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
+            "kerberos_keytab": "{{mapred-site/mapreduce.jobhistory.webapp.spnego-keytab-file}}",
+            "kerberos_principal": "{{mapred-site/mapreduce.jobhistory.webapp.spnego-principal}}",
             "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
             "https_property": "{{mapred-site/mapreduce.jobhistory.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
@@ -83,6 +85,8 @@
           "uri": {
             "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
             "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
+            "kerberos_keytab": "{{mapred-site/mapreduce.jobhistory.webapp.spnego-keytab-file}}",
+            "kerberos_principal": "{{mapred-site/mapreduce.jobhistory.webapp.spnego-principal}}",
             "https_property": "{{mapred-site/mapreduce.jobhistory.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
             "connection_timeout": 5.0
@@ -270,6 +274,8 @@
           "uri": {
             "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}",
             "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
+            "kerberos_keytab": "{{yarn-site/yarn.resourcemanager.webapp.spnego-keytab-file}}",
+            "kerberos_principal": "{{yarn-site/yarn.resourcemanager.webapp.spnego-principal}}",
             "https_property": "{{yarn-site/yarn.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
             "connection_timeout": 5.0,
@@ -314,6 +320,8 @@
           "uri": {
             "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}",
             "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
+            "kerberos_keytab": "{{yarn-site/yarn.resourcemanager.webapp.spnego-keytab-file}}",
+            "kerberos_principal": "{{yarn-site/yarn.resourcemanager.webapp.spnego-principal}}",
             "https_property": "{{yarn-site/yarn.http.policy}}",
             "https_property_value": "HTTPS_ONLY",
             "connection_timeout": 5.0,

+ 30 - 4
ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py

@@ -23,6 +23,8 @@ import socket
 import urllib2
 from ambari_commons import OSCheck
 from ambari_commons.inet_utils import resolve_address
+from resource_management.libraries.functions.curl_krb_request import curl_krb_request
+from resource_management.core.environment import Environment
 
 RESULT_CODE_OK = 'OK'
 RESULT_CODE_CRITICAL = 'CRITICAL'
@@ -38,6 +40,10 @@ CRITICAL_HTTP_STATUS_MESSAGE = 'HTTP {0} returned from {1} ({2})'
 CRITICAL_NODEMANAGER_STATUS_MESSAGE = 'NodeManager returned an unexpected status of "{0}"'
 CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager health from unexpected JSON response'
 
+KERBEROS_KEYTAB = '{{yarn-site/yarn.nodemanager.webapp.spnego-keytab-file}}'
+KERBEROS_PRINCIPAL = '{{yarn-site/yarn.nodemanager.webapp.spnego-principal}}'
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+
 NODEMANAGER_DEFAULT_PORT = 8042
 
 CONNECTION_TIMEOUT_KEY = 'connection.timeout'
@@ -49,7 +55,7 @@ def get_tokens():
   to build the dictionary passed into execute
   """
   return (NODEMANAGER_HTTP_ADDRESS_KEY,NODEMANAGER_HTTPS_ADDRESS_KEY,
-  YARN_HTTP_POLICY_KEY)
+  YARN_HTTP_POLICY_KEY, KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY)
   
 
 def execute(configurations={}, parameters={}, host_name=None):
@@ -71,6 +77,19 @@ def execute(configurations={}, parameters={}, host_name=None):
   https_uri = None
   http_policy = 'HTTP_ONLY'
 
+  security_enabled = False
+  if SECURITY_ENABLED_KEY in configurations:
+    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'
+
+  kerberos_keytab = None
+  if KERBEROS_KEYTAB in configurations:
+    kerberos_keytab = configurations[KERBEROS_KEYTAB]
+
+  kerberos_principal = None
+  if KERBEROS_PRINCIPAL in configurations:
+    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
+    kerberos_principal = kerberos_principal.replace('_HOST', host_name)
+
   if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
     http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]
 
@@ -116,8 +135,16 @@ def execute(configurations={}, parameters={}, host_name=None):
   query = "{0}://{1}/ws/v1/node/info".format(scheme,uri)
 
   try:
-    # execute the query for the JSON that includes templeton status
-    url_response = urllib2.urlopen(query, timeout=connection_timeout)
+    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
+      env = Environment.get_instance()
+      url_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
+                                                query, "nm_health_alert", None, False, "NodeManager Health")
+
+      json_response = json.loads(url_response)
+    else:
+      # execute the query for the JSON that includes templeton status
+      url_response = urllib2.urlopen(query, timeout=connection_timeout)
+      json_response = json.loads(url_response.read())
   except urllib2.HTTPError, httpError:
     label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
       str(httpError))
@@ -129,7 +156,6 @@ def execute(configurations={}, parameters={}, host_name=None):
 
   # URL response received, parse it
   try:
-    json_response = json.loads(url_response.read())
     node_healthy = json_response['nodeInfo']['nodeHealthy']
     node_healthy_report = json_response['nodeInfo']['healthReport']
 

+ 56 - 6
ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py

@@ -20,8 +20,11 @@ limitations under the License.
 
 import urllib2
 import json
+import logging
 
 from ambari_commons.urllib_handlers import RefreshHeaderProcessor
+from resource_management.libraries.functions.curl_krb_request import curl_krb_request
+from resource_management.core.environment import Environment
 
 ERROR_LABEL = '{0} NodeManager{1} {2} unhealthy.'
 OK_LABEL = 'All NodeManagers are healthy'
@@ -30,16 +33,22 @@ NODEMANAGER_HTTP_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.address}
 NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.https.address}}'
 YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
 
+KERBEROS_KEYTAB = '{{yarn-site/yarn.nodemanager.webapp.spnego-keytab-file}}'
+KERBEROS_PRINCIPAL = '{{yarn-site/yarn.nodemanager.webapp.spnego-principal}}'
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+
 CONNECTION_TIMEOUT_KEY = 'connection.timeout'
 CONNECTION_TIMEOUT_DEFAULT = 5.0
-  
+
+logger = logging.getLogger()
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
   to build the dictionary passed into execute
   """
   return NODEMANAGER_HTTP_ADDRESS_KEY, NODEMANAGER_HTTPS_ADDRESS_KEY, \
-    YARN_HTTP_POLICY_KEY
+    YARN_HTTP_POLICY_KEY, KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY
 
 
 def execute(configurations={}, parameters={}, host_name=None):
@@ -59,7 +68,20 @@ def execute(configurations={}, parameters={}, host_name=None):
   http_uri = None
   https_uri = None
   http_policy = 'HTTP_ONLY'
-  
+
+  security_enabled = False
+  if SECURITY_ENABLED_KEY in configurations:
+    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'
+
+  kerberos_keytab = None
+  if KERBEROS_KEYTAB in configurations:
+    kerberos_keytab = configurations[KERBEROS_KEYTAB]
+
+  kerberos_principal = None
+  if KERBEROS_PRINCIPAL in configurations:
+    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
+    kerberos_principal = kerberos_principal.replace('_HOST', host_name)
+
   if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
     http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]
 
@@ -78,16 +100,44 @@ def execute(configurations={}, parameters={}, host_name=None):
   uri = http_uri
   if http_policy == 'HTTPS_ONLY':
     scheme = 'https'
-    
+
     if https_uri is not None:
       uri = https_uri
 
+  uri = str(host_name) + ":" + uri.split(":")[1]
   live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(scheme, uri)
-
+  convert_to_json_failed = False
+  response_code = None
   try:
-    live_nodemanagers = json.loads(get_value_from_jmx(live_nodemanagers_qry,
+    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
+      env = Environment.get_instance()
+      url_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
+                                              live_nodemanagers_qry, "nm_health_summary_alert", None, False,
+                                              "NodeManager Health Summary")
+      try:
+        url_response_json = json.loads(url_response)
+        live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"])
+      except ValueError, error:
+        convert_to_json_failed = True
+        if logger.isEnabledFor(logging.DEBUG):
+          logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
+          format("NodeManager Health Summary", str(error)))
+
+      if convert_to_json_failed:
+        response_code, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
+                                                    live_nodemanagers_qry, "nm_health_summary_alert", None, True,
+                                                    "NodeManager Health Summary")
+    else:
+      live_nodemanagers = json.loads(get_value_from_jmx(live_nodemanagers_qry,
       "LiveNodeManagers", connection_timeout))
 
+    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
+      if response_code in [200, 307] and convert_to_json_failed:
+        return ('UNKNOWN', ['HTTP {0} response (metrics unavailable)'.format(str(response_code))])
+      elif convert_to_json_failed and response_code not in [200, 307]:
+        raise Exception("[Alert][NodeManager Health Summary] Getting data from {0} failed with http code {1}".format(
+          str(live_nodemanagers_qry), str(response_code)))
+
     unhealthy_count = 0
 
     for nodemanager in live_nodemanagers: