Browse Source

AMBARI-10021 - Python Does Not Close Alert TCP Connections Reliably

Jonathan Hurley 10 years ago
parent
commit
516d718fc9
24 changed files with 250 additions and 126 deletions
  1. 13 4
      ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
  2. 77 39
      ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py
  3. 0 3
      ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/alerts.json
  4. 1 1
      ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/alerts.json
  5. 0 6
      ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/alerts.json
  6. 4 4
      ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
  7. 15 5
      ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
  8. 15 5
      ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
  9. 0 3
      ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
  10. 0 3
      ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py
  11. 8 1
      ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py
  12. 1 1
      ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/alerts.json
  13. 1 1
      ambari-server/src/main/resources/common-services/STORM/0.9.1.2.1/alerts.json
  14. 4 4
      ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/alerts.json
  15. 20 5
      ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
  16. 18 9
      ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
  17. 0 3
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HBASE/alerts.json
  18. 4 4
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json
  19. 15 5
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py
  20. 15 5
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py
  21. 1 4
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HIVE/package/files/alert_hive_thrift_port.py
  22. 9 2
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/WEBHCAT/package/files/alert_webhcat_server.py
  23. 4 4
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/alerts.json
  24. 25 5
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py

+ 13 - 4
ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py

@@ -154,10 +154,19 @@ class MetricAlert(BaseAlert):
 
 
       # use a custom header processor that will look for the non-standard
       # use a custom header processor that will look for the non-standard
       # "Refresh" header and attempt to follow the redirect
       # "Refresh" header and attempt to follow the redirect
-      url_opener = urllib2.build_opener(RefreshHeaderProcessor())
-      response = url_opener.open(url)
-
-      content = response.read()
+      response = None
+      try:
+        url_opener = urllib2.build_opener(RefreshHeaderProcessor())
+        response = url_opener.open(url)
+        content = response.read()
+      finally:
+        # explicitly close the connection as we've seen python hold onto these
+        if response is not None:
+          try:
+            response.close()
+          except:
+            logger.debug("[Alert][{0}] Unable to close JMX URL connection to {1}".format
+              (self.get_name(), url))
 
 
       json_response = json.loads(content)
       json_response = json.loads(content)
       json_data = json_response['beans'][0]
       json_data = json_response['beans'][0]

+ 77 - 39
ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py

@@ -22,7 +22,9 @@ import logging
 import time
 import time
 import subprocess
 import subprocess
 import os
 import os
+import urllib2
 import uuid
 import uuid
+
 from  tempfile import gettempdir
 from  tempfile import gettempdir
 from alerts.base_alert import BaseAlert
 from alerts.base_alert import BaseAlert
 from collections import namedtuple
 from collections import namedtuple
@@ -45,10 +47,13 @@ except ImportError:
 
 
 logger = logging.getLogger()
 logger = logging.getLogger()
 
 
-CURL_CONNECTION_TIMEOUT = '20'
+CONNECTION_TIMEOUT = 10.0
+CURL_CONNECTION_TIMEOUT = "10"
+
+WebResponse = namedtuple('WebResponse', 'status_code time_millis error_msg')
 
 
 class WebAlert(BaseAlert):
 class WebAlert(BaseAlert):
-  
+
   def __init__(self, alert_meta, alert_source_meta, config):
   def __init__(self, alert_meta, alert_source_meta, config):
     super(WebAlert, self).__init__(alert_meta, alert_source_meta)
     super(WebAlert, self).__init__(alert_meta, alert_source_meta)
     
     
@@ -72,6 +77,10 @@ class WebAlert(BaseAlert):
       self.get_name(), alert_uri.uri, str(alert_uri.is_ssl_enabled)))
       self.get_name(), alert_uri.uri, str(alert_uri.is_ssl_enabled)))
 
 
     url = self._build_web_query(alert_uri)
     url = self._build_web_query(alert_uri)
+
+    # substitute 0.0.0.0 in url with actual fqdn
+    url = url.replace('0.0.0.0', self.host_name)
+
     web_response = self._make_web_request(url)
     web_response = self._make_web_request(url)
     status_code = web_response.status_code
     status_code = web_response.status_code
     time_seconds = web_response.time_millis / 1000
     time_seconds = web_response.time_millis / 1000
@@ -79,10 +88,10 @@ class WebAlert(BaseAlert):
 
 
     if status_code == 0:
     if status_code == 0:
       return (self.RESULT_CRITICAL, [status_code, url, time_seconds, error_message])
       return (self.RESULT_CRITICAL, [status_code, url, time_seconds, error_message])
-    
+
     if status_code < 400:
     if status_code < 400:
       return (self.RESULT_OK, [status_code, url, time_seconds])
       return (self.RESULT_OK, [status_code, url, time_seconds])
-    
+
     return (self.RESULT_WARNING, [status_code, url, time_seconds])
     return (self.RESULT_WARNING, [status_code, url, time_seconds])
 
 
 
 
@@ -116,9 +125,11 @@ class WebAlert(BaseAlert):
     scheme = 'http'
     scheme = 'http'
     if alert_uri.is_ssl_enabled is True:
     if alert_uri.is_ssl_enabled is True:
       scheme = 'https'
       scheme = 'https'
+
     if OSCheck.is_windows_family():
     if OSCheck.is_windows_family():
       # on Windows, 0.0.0.0 is an invalid address to connect to, but on Linux it resolves to 127.0.0.1
       # on Windows, 0.0.0.0 is an invalid address to connect to, but on Linux it resolves to 127.0.0.1
       host = resolve_address(host)
       host = resolve_address(host)
+
     return "{0}://{1}:{2}".format(scheme, host, str(port))
     return "{0}://{1}:{2}".format(scheme, host, str(port))
 
 
 
 
@@ -127,10 +138,7 @@ class WebAlert(BaseAlert):
     Makes an http(s) request to a web resource and returns the http code. If
     Makes an http(s) request to a web resource and returns the http code. If
     there was an error making the request, return 0 for the status code.
     there was an error making the request, return 0 for the status code.
     """    
     """    
-    WebResponse = namedtuple('WebResponse', 'status_code time_millis error_msg')
-    
-    time_millis = 0
-    
+
     try:
     try:
       kerberos_keytab = None
       kerberos_keytab = None
       kerberos_principal = None
       kerberos_principal = None
@@ -164,49 +172,79 @@ class WebAlert(BaseAlert):
 
 
         if os.system("{0} -s {1}".format(klist_path_local, ccache_file_path)) != 0:
         if os.system("{0} -s {1}".format(klist_path_local, ccache_file_path)) != 0:
           kinit_path_local = get_kinit_path()
           kinit_path_local = get_kinit_path()
-          logger.debug("[Alert][{0}] Enabling Kerberos authentication via GSSAPI using ccache at {1}."
-                       .format(self.get_name(), ccache_file_path))
-          os.system("{0} -l 5m -c {1} -kt {2} {3} > /dev/null".format(kinit_path_local, ccache_file_path, kerberos_keytab, kerberos_principal))
+          logger.debug("[Alert][{0}] Enabling Kerberos authentication via GSSAPI using ccache at {1}.".format(
+            self.get_name(), ccache_file_path))
+
+          os.system("{0} -l 5m -c {1} -kt {2} {3} > /dev/null".format(
+            kinit_path_local, ccache_file_path, kerberos_keytab,
+            kerberos_principal))
         else:
         else:
-          logger.debug("[Alert][{0}] Kerberos authentication via GSSAPI already enabled using ccache at {1}."
-                       .format(self.get_name(), ccache_file_path))
-      else:
-        kerberos_env = None
+          logger.debug("[Alert][{0}] Kerberos authentication via GSSAPI already enabled using ccache at {1}.".format(
+            self.get_name(), ccache_file_path))
 
 
-      # check if cookies dir exists, if not then create it
-      tmp_dir = self.config.get('agent', 'tmp_dir')
-      cookies_dir = os.path.join(tmp_dir, "cookies")
+        # check if cookies dir exists, if not then create it
+        tmp_dir = self.config.get('agent', 'tmp_dir')
+        cookies_dir = os.path.join(tmp_dir, "cookies")
 
 
-      if not os.path.exists(cookies_dir):
-        os.makedirs(cookies_dir)
+        if not os.path.exists(cookies_dir):
+          os.makedirs(cookies_dir)
 
 
-      # substitute 0.0.0.0 in url with actual fqdn
-      url = url.replace('0.0.0.0', self.host_name)
-      cookie_file_name = str(uuid.uuid4())
-      cookie_file = os.path.join(cookies_dir, cookie_file_name)
-      start_time = time.time()
-      curl = subprocess.Popen(['curl', '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file, '-sL', '-w',
-        '%{http_code}', url, '--connect-timeout', CURL_CONNECTION_TIMEOUT,
-        '-o', '/dev/null'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)
+        cookie_file_name = str(uuid.uuid4())
+        cookie_file = os.path.join(cookies_dir, cookie_file_name)
 
 
-      out, err = curl.communicate()
+        start_time = time.time()
 
 
-      if err != '':
-        raise Exception(err)
+        try:
+          curl = subprocess.Popen(['curl', '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file, '-sL', '-w',
+            '%{http_code}', url, '--connect-timeout', CURL_CONNECTION_TIMEOUT,
+            '-o', '/dev/null'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)
 
 
-      response_code = int(out)
-      time_millis = time.time() - start_time
-    except Exception, exc:
+          out, err = curl.communicate()
+        finally:
+          if os.path.isfile(cookie_file):
+            os.remove(cookie_file)
+
+        if err != '':
+          raise Exception(err)
+
+        response_code = int(out)
+        time_millis = time.time() - start_time
+      else:
+        # kerberos is not involved; use urllib2
+        response_code, time_millis = self._make_web_request_urllib(url)
+
+      return WebResponse(status_code=response_code, time_millis=time_millis,
+        error_msg=None)
+
+    except Exception, exception:
       if logger.isEnabledFor(logging.DEBUG):
       if logger.isEnabledFor(logging.DEBUG):
         logger.exception("[Alert][{0}] Unable to make a web request.".format(self.get_name()))
         logger.exception("[Alert][{0}] Unable to make a web request.".format(self.get_name()))
 
 
-      return WebResponse(status_code=0, time_millis=0, error_msg=str(exc))
+      return WebResponse(status_code=0, time_millis=0, error_msg=str(exception))
 
 
-    finally:
-      if os.path.isfile(cookie_file):
-        os.remove(cookie_file)
 
 
-    return WebResponse(status_code=response_code, time_millis=time_millis, error_msg=None)
+  def _make_web_request_urllib(self, url):
+    """
+    Make a web request using urllib2. This function does not handle exceptions.
+    :param url: the URL to request
+    :return: a tuple of the response code and the elapsed time (a time.time() difference, i.e. seconds, despite the time_millis name)
+    """
+    response = None
+    start_time = time.time()
+
+    try:
+      response = urllib2.urlopen(url, timeout=CONNECTION_TIMEOUT)
+      response_code = response.getcode()
+      time_millis = time.time() - start_time
+
+      return response_code, time_millis
+    finally:
+      if response is not None:
+        try:
+          response.close()
+        except Exception, exception:
+          if logger.isEnabledFor(logging.DEBUG):
+            logger.exception("[Alert][{0}] Unable to close socket connection".format(self.get_name()))
 
 
 
 
   def _get_reporting_text(self, state):
   def _get_reporting_text(self, state):

+ 0 - 3
ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/alerts.json

@@ -90,9 +90,6 @@
           "type": "METRIC",
           "type": "METRIC",
           "uri": {
           "uri": {
             "http": "{{ams-hbase-site/hbase.master.info.port}}",
             "http": "{{ams-hbase-site/hbase.master.info.port}}",
-            "https": "{{ams-hbase-site/hbase.master.info.port}}",
-            "https_property": "{{cluster-env/security_enabled}}",
-            "https_property_value": "true",
             "default_port": 61310
             "default_port": 61310
           },
           },
           "reporting": {
           "reporting": {

+ 1 - 1
ambari-server/src/main/resources/common-services/FALCON/0.5.0.2.1/alerts.json

@@ -50,7 +50,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }

+ 0 - 6
ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/alerts.json

@@ -64,9 +64,6 @@
           "type": "METRIC",
           "type": "METRIC",
           "uri": {
           "uri": {
             "http": "{{hbase-site/hbase.master.info.port}}",
             "http": "{{hbase-site/hbase.master.info.port}}",
-            "https": "{{hbase-site/hbase.master.info.port}}",
-            "https_property": "{{cluster-env/security_enabled}}",
-            "https_property_value": "true",
             "default_port": 60010
             "default_port": 60010
           },
           },
           "reporting": {
           "reporting": {
@@ -103,9 +100,6 @@
           "type": "METRIC",
           "type": "METRIC",
           "uri": {
           "uri": {
             "http": "{{hbase-site/hbase.master.info.port}}",
             "http": "{{hbase-site/hbase.master.info.port}}",
-            "https": "{{hbase-site/hbase.master.info.port}}",
-            "https_property": "{{cluster-env/security_enabled}}",
-            "https_property_value": "true",
             "default_port": 60010
             "default_port": 60010
           },
           },
           "reporting": {
           "reporting": {

+ 4 - 4
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json

@@ -109,7 +109,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }
@@ -359,11 +359,11 @@
               "text": "All {2} DataNode(s) are healthy"
               "text": "All {2} DataNode(s) are healthy"
             },
             },
             "warning": {
             "warning": {
-              "text": "Dead/Stale Datanode(s): {0}/{1} out of {2}",
+              "text": "DataNode Health: [Live={2}, Stale={1}, Dead={0}]",
               "value": 1
               "value": 1
             },
             },
             "critical": {
             "critical": {
-              "text": "Dead/Stale Datanode(s): {0}/{1} out of {2}",
+              "text": "DataNode Health: [Live={2}, Stale={1}, Dead={0}]",
               "value": 1
               "value": 1
             },
             },
             "units" : "DNs"
             "units" : "DNs"
@@ -512,7 +512,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }

+ 15 - 5
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py

@@ -129,8 +129,18 @@ def get_time(delta):
   return {'h':h, 'm':m}
   return {'h':h, 'm':m}
 
 
 
 
-def get_value_from_jmx(qry, property):
-  response = urllib2.urlopen(qry)
-  data=response.read()
-  data_dict = json.loads(data)
-  return data_dict["beans"][0][property]
+def get_value_from_jmx(query, jmx_property):
+  response = None
+  
+  try:
+    response = urllib2.urlopen(query)
+    data = response.read()
+
+    data_dict = json.loads(data)
+    return data_dict["beans"][0][jmx_property]
+  finally:
+    if response is not None:
+      try:
+        response.close()
+      except:
+        pass

+ 15 - 5
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py

@@ -159,8 +159,18 @@ def execute(parameters=None, host_name=None):
       return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
       return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
 
 
 
 
-def get_value_from_jmx(qry, property):
-  response = urllib2.urlopen(qry)
-  data=response.read()
-  data_dict = json.loads(data)
-  return data_dict["beans"][0][property]
+def get_value_from_jmx(query, jmx_property):
+  response = None
+  
+  try:
+    response = urllib2.urlopen(query)
+    data = response.read()
+
+    data_dict = json.loads(data)
+    return data_dict["beans"][0][jmx_property]
+  finally:
+    if response is not None:
+      try:
+        response.close()
+      except:
+        pass

+ 0 - 3
ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py

@@ -18,11 +18,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 limitations under the License.
 """
 """
 
 
-import json
 import socket
 import socket
 import time
 import time
-import traceback
-import urllib2
 from resource_management.libraries.functions import format
 from resource_management.libraries.functions import format
 from resource_management.libraries.functions import get_kinit_path
 from resource_management.libraries.functions import get_kinit_path
 from resource_management.core.resources import Execute
 from resource_management.core.resources import Execute

+ 0 - 3
ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py

@@ -18,11 +18,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 limitations under the License.
 """
 """
 
 
-import json
 import socket
 import socket
 import time
 import time
-import traceback
-import urllib2
 from resource_management.libraries.functions import hive_check
 from resource_management.libraries.functions import hive_check
 from resource_management.libraries.functions import format
 from resource_management.libraries.functions import format
 from resource_management.libraries.functions import get_kinit_path
 from resource_management.libraries.functions import get_kinit_path

+ 8 - 1
ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py

@@ -96,7 +96,6 @@ def execute(parameters=None, host_name=None):
   if security_enabled:
   if security_enabled:
     if WEBHCAT_KEYTAB_KEY not in parameters or WEBHCAT_PRINCIPAL_KEY not in parameters:
     if WEBHCAT_KEYTAB_KEY not in parameters or WEBHCAT_PRINCIPAL_KEY not in parameters:
       return (RESULT_CODE_UNKNOWN, [str(parameters)])
       return (RESULT_CODE_UNKNOWN, [str(parameters)])
-      # return (RESULT_CODE_UNKNOWN, ['The WebHCat keytab and principal are required parameters when security is enabled.'])
 
 
     try:
     try:
       webhcat_keytab = parameters[WEBHCAT_KEYTAB_KEY]
       webhcat_keytab = parameters[WEBHCAT_KEYTAB_KEY]
@@ -164,6 +163,8 @@ def execute(parameters=None, host_name=None):
     except Exception, exception:
     except Exception, exception:
       return (RESULT_CODE_CRITICAL, [str(exception)])
       return (RESULT_CODE_CRITICAL, [str(exception)])
   else:
   else:
+    url_response = None
+    
     try:
     try:
       # execute the query for the JSON that includes WebHCat status
       # execute the query for the JSON that includes WebHCat status
       start_time = time.time()
       start_time = time.time()
@@ -177,6 +178,12 @@ def execute(parameters=None, host_name=None):
     except:
     except:
       label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
       label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
       return (RESULT_CODE_CRITICAL, [label])
       return (RESULT_CODE_CRITICAL, [label])
+    finally:
+      if url_response is not None:
+        try:
+          url_response.close()
+        except:
+          pass
 
 
 
 
   # if status is not in the response, we can't do any check; return CRIT
   # if status is not in the response, we can't do any check; return CRIT

+ 1 - 1
ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/alerts.json

@@ -23,7 +23,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }

+ 1 - 1
ambari-server/src/main/resources/common-services/STORM/0.9.1.2.1/alerts.json

@@ -73,7 +73,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }

+ 4 - 4
ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/alerts.json

@@ -26,7 +26,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }
@@ -189,7 +189,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }
@@ -237,7 +237,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }
@@ -366,7 +366,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }

+ 20 - 5
ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py

@@ -23,6 +23,7 @@ import socket
 import urllib2
 import urllib2
 from ambari_commons import OSCheck
 from ambari_commons import OSCheck
 from ambari_commons.inet_utils import resolve_address
 from ambari_commons.inet_utils import resolve_address
+
 RESULT_CODE_OK = 'OK'
 RESULT_CODE_OK = 'OK'
 RESULT_CODE_CRITICAL = 'CRITICAL'
 RESULT_CODE_CRITICAL = 'CRITICAL'
 RESULT_CODE_UNKNOWN = 'UNKNOWN'
 RESULT_CODE_UNKNOWN = 'UNKNOWN'
@@ -32,7 +33,8 @@ NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.nodemanager.webapp.https.addre
 YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
 YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
 
 
 OK_MESSAGE = 'NodeManager Healthy'
 OK_MESSAGE = 'NodeManager Healthy'
-CRITICAL_CONNECTION_MESSAGE = 'Connection failed to {0}'
+CRITICAL_CONNECTION_MESSAGE = 'Connection failed to {0} ({1})'
+CRITICAL_HTTP_STATUS_MESSAGE = 'HTTP {0} returned from {1} ({2})'
 CRITICAL_NODEMANAGER_STATUS_MESSAGE = 'NodeManager returned an unexpected status of "{0}"'
 CRITICAL_NODEMANAGER_STATUS_MESSAGE = 'NodeManager returned an unexpected status of "{0}"'
 CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager health from unexpected JSON response'
 CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager health from unexpected JSON response'
 
 
@@ -93,18 +95,25 @@ def execute(parameters=None, host_name=None):
       host_name = socket.getfqdn()
       host_name = socket.getfqdn()
 
 
     uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)
     uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)
+    
   if OSCheck.is_windows_family():
   if OSCheck.is_windows_family():
     uri_host, uri_port = uri.split(':')
     uri_host, uri_port = uri.split(':')
     # on Windows, 0.0.0.0 is an invalid address to connect to, but on Linux it resolves to 127.0.0.1
     # on Windows, 0.0.0.0 is an invalid address to connect to, but on Linux it resolves to 127.0.0.1
     uri_host = resolve_address(uri_host)
     uri_host = resolve_address(uri_host)
     uri = '{0}:{1}'.format(uri_host, uri_port)
     uri = '{0}:{1}'.format(uri_host, uri_port)
+
+  query = "{0}://{1}/ws/v1/node/info".format(scheme,uri)
+
   try:
   try:
-    query = "{0}://{1}/ws/v1/node/info".format(scheme,uri)
-    
     # execute the query for the JSON that includes templeton status
     # execute the query for the JSON that includes the NodeManager node info
     # execute the query for the JSON that includes the NodeManager node info
     url_response = urllib2.urlopen(query)
-  except:
-    label = CRITICAL_CONNECTION_MESSAGE.format(uri)
+  except urllib2.HTTPError, httpError:
+    label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
+      str(httpError))
+
+    return (RESULT_CODE_CRITICAL, [label])
+  except Exception, exception:
+    label = CRITICAL_CONNECTION_MESSAGE.format(query, str(exception))
     return (RESULT_CODE_CRITICAL, [label])
     return (RESULT_CODE_CRITICAL, [label])
 
 
   # URL response received, parse it
   # URL response received, parse it
@@ -116,6 +125,12 @@ def execute(parameters=None, host_name=None):
     node_healthy = str(node_healthy)
     node_healthy = str(node_healthy)
   except:
   except:
     return (RESULT_CODE_CRITICAL, [query])
     return (RESULT_CODE_CRITICAL, [query])
+  finally:
+    if url_response is not None:
+      try:
+        url_response.close()
+      except:
+        pass
 
 
   # proper JSON received, compare against known value
   # proper JSON received, compare against known value
   if node_healthy.lower() == 'true':
   if node_healthy.lower() == 'true':

+ 18 - 9
ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py

@@ -102,12 +102,21 @@ def execute(parameters=None, host_name=None):
   return ((result_code, [label]))
   return ((result_code, [label]))
 
 
 
 
-def get_value_from_jmx(url, property):
-  # use a customer header process that will look for the non-standard
-  # "Refresh" header and attempt to follow the redirect
-  url_opener = urllib2.build_opener(RefreshHeaderProcessor())
-  response = url_opener.open(url)
-
-  data=response.read()
-  data_dict = json.loads(data)
-  return data_dict["beans"][0][property]
+def get_value_from_jmx(query, jmx_property):
+  response = None
+  
+  try:
+    # use a custom header processor that will look for the non-standard
+    # "Refresh" header and attempt to follow the redirect
+    url_opener = urllib2.build_opener(RefreshHeaderProcessor())
+    response = url_opener.open(query)
+
+    data = response.read()
+    data_dict = json.loads(data)
+    return data_dict["beans"][0][jmx_property]
+  finally:
+    if response is not None:
+      try:
+        response.close()
+      except:
+        pass

+ 0 - 3
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HBASE/alerts.json

@@ -64,9 +64,6 @@
           "type": "METRIC",
           "type": "METRIC",
           "uri": {
           "uri": {
             "http": "{{hbase-site/hbase.master.info.port}}",
             "http": "{{hbase-site/hbase.master.info.port}}",
-            "https": "{{hbase-site/hbase.master.info.port}}",
-            "https_property": "{{cluster-env/security_enabled}}",
-            "https_property_value": "true",
             "default_port": 60010
             "default_port": 60010
           },
           },
           "reporting": {
           "reporting": {

+ 4 - 4
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json

@@ -109,7 +109,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }
@@ -359,11 +359,11 @@
               "text": "All {2} DataNode(s) are healthy"
               "text": "All {2} DataNode(s) are healthy"
             },
             },
             "warning": {
             "warning": {
-              "text": "Dead/Stale Datanode(s): {0}/{1} out of {2}",
+              "text": "DataNode Health: [Live={2}, Stale={1}, Dead={0}]",
               "value": 1
               "value": 1
             },
             },
             "critical": {
             "critical": {
-              "text": "Dead/Stale Datanode(s): {0}/{1} out of {2}",
+              "text": "DataNode Health: [Live={2}, Stale={1}, Dead={0}]",
               "value": 1
               "value": 1
             },
             },
             "units" : "DNs"
             "units" : "DNs"
@@ -512,7 +512,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }

+ 15 - 5
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py

@@ -129,8 +129,18 @@ def get_time(delta):
   return {'h':h, 'm':m}
   return {'h':h, 'm':m}
 
 
 
 
-def get_value_from_jmx(qry, property):
-  response = urllib2.urlopen(qry)
-  data=response.read()
-  data_dict = json.loads(data)
-  return data_dict["beans"][0][property]
+def get_value_from_jmx(query, jmx_property):
+  response = None
+  
+  try:
+    response = urllib2.urlopen(query)
+    data = response.read()
+
+    data_dict = json.loads(data)
+    return data_dict["beans"][0][jmx_property]
+  finally:
+    if response is not None:
+      try:
+        response.close()
+      except:
+        pass

+ 15 - 5
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py

@@ -159,8 +159,18 @@ def execute(parameters=None, host_name=None):
       return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
       return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
 
 
 
 
-def get_value_from_jmx(qry, property):
-  response = urllib2.urlopen(qry)
-  data=response.read()
-  data_dict = json.loads(data)
-  return data_dict["beans"][0][property]
+def get_value_from_jmx(query, jmx_property):
+  response = None
+  
+  try:
+    response = urllib2.urlopen(query)
+    data = response.read()
+
+    data_dict = json.loads(data)
+    return data_dict["beans"][0][jmx_property]
+  finally:
+    if response is not None:
+      try:
+        response.close()
+      except:
+        pass

+ 1 - 4
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HIVE/package/files/alert_hive_thrift_port.py

@@ -18,12 +18,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 limitations under the License.
 """
 """
 
 
-import json
 import socket
 import socket
 import time
 import time
-import traceback
-import urllib2
-from resource_management.libraries.functions import hive_check 
+from resource_management.libraries.functions import hive_check
 
 
 OK_MESSAGE = "TCP OK - %.4f response on port %s"
 OK_MESSAGE = "TCP OK - %.4f response on port %s"
 CRITICAL_MESSAGE = "Connection failed on host {0}:{1}"
 CRITICAL_MESSAGE = "Connection failed on host {0}:{1}"

+ 9 - 2
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/WEBHCAT/package/files/alert_webhcat_server.py

@@ -96,7 +96,6 @@ def execute(parameters=None, host_name=None):
   if security_enabled:
   if security_enabled:
     if WEBHCAT_KEYTAB_KEY not in parameters or WEBHCAT_PRINCIPAL_KEY not in parameters:
     if WEBHCAT_KEYTAB_KEY not in parameters or WEBHCAT_PRINCIPAL_KEY not in parameters:
       return (RESULT_CODE_UNKNOWN, [str(parameters)])
       return (RESULT_CODE_UNKNOWN, [str(parameters)])
-      # return (RESULT_CODE_UNKNOWN, ['The WebHCat keytab and principal are required parameters when security is enabled.'])
 
 
     try:
     try:
       webhcat_keytab = parameters[WEBHCAT_KEYTAB_KEY]
       webhcat_keytab = parameters[WEBHCAT_KEYTAB_KEY]
@@ -164,6 +163,8 @@ def execute(parameters=None, host_name=None):
     except Exception, exception:
     except Exception, exception:
       return (RESULT_CODE_CRITICAL, [str(exception)])
       return (RESULT_CODE_CRITICAL, [str(exception)])
   else:
   else:
+    url_response = None
+    
     try:
     try:
       # execute the query for the JSON that includes WebHCat status
       # execute the query for the JSON that includes WebHCat status
       start_time = time.time()
       start_time = time.time()
@@ -177,6 +178,12 @@ def execute(parameters=None, host_name=None):
     except:
     except:
       label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
       label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
       return (RESULT_CODE_CRITICAL, [label])
       return (RESULT_CODE_CRITICAL, [label])
+    finally:
+      if url_response is not None:
+        try:
+          url_response.close()
+        except:
+          pass
 
 
 
 
   # if status is not in the response, we can't do any check; return CRIT
   # if status is not in the response, we can't do any check; return CRIT
@@ -199,4 +206,4 @@ def execute(parameters=None, host_name=None):
     result_code = RESULT_CODE_CRITICAL
     result_code = RESULT_CODE_CRITICAL
     label = CRITICAL_WEBHCAT_STATUS_MESSAGE.format(webhcat_status)
     label = CRITICAL_WEBHCAT_STATUS_MESSAGE.format(webhcat_status)
 
 
-  return (result_code, [label])
+  return (result_code, [label])

+ 4 - 4
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/alerts.json

@@ -26,7 +26,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }
@@ -189,7 +189,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }
@@ -237,7 +237,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }
@@ -354,7 +354,7 @@
               "text": "HTTP {0} response in {2:.3f} seconds"
               "text": "HTTP {0} response in {2:.3f} seconds"
             },
             },
             "critical": {
             "critical": {
-              "text": "Connection failed to {1}: {3}"
+              "text": "Connection failed to {1} ({3})"
             }
             }
           }
           }
         }
         }

+ 25 - 5
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py

@@ -21,6 +21,8 @@ limitations under the License.
 import json
 import json
 import socket
 import socket
 import urllib2
 import urllib2
+from ambari_commons import OSCheck
+from ambari_commons.inet_utils import resolve_address
 
 
 RESULT_CODE_OK = 'OK'
 RESULT_CODE_OK = 'OK'
 RESULT_CODE_CRITICAL = 'CRITICAL'
 RESULT_CODE_CRITICAL = 'CRITICAL'
@@ -31,7 +33,8 @@ NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.nodemanager.webapp.https.addre
 YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
 YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
 
 
 OK_MESSAGE = 'NodeManager Healthy'
 OK_MESSAGE = 'NodeManager Healthy'
-CRITICAL_CONNECTION_MESSAGE = 'Connection failed to {0}'
+CRITICAL_CONNECTION_MESSAGE = 'Connection failed to {0} ({1})'
+CRITICAL_HTTP_STATUS_MESSAGE = 'HTTP {0} returned from {1} ({2})'
 CRITICAL_NODEMANAGER_STATUS_MESSAGE = 'NodeManager returned an unexpected status of "{0}"'
 CRITICAL_NODEMANAGER_STATUS_MESSAGE = 'NodeManager returned an unexpected status of "{0}"'
 CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager health from unexpected JSON response'
 CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager health from unexpected JSON response'
 
 
@@ -92,14 +95,25 @@ def execute(parameters=None, host_name=None):
       host_name = socket.getfqdn()
       host_name = socket.getfqdn()
 
 
     uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)
     uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)
+    
+  if OSCheck.is_windows_family():
+    uri_host, uri_port = uri.split(':')
+    # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
+    uri_host = resolve_address(uri_host)
+    uri = '{0}:{1}'.format(uri_host, uri_port)
+
+  query = "{0}://{1}/ws/v1/node/info".format(scheme,uri)
 
 
   try:
   try:
-    query = "{0}://{1}/ws/v1/node/info".format(scheme,uri)
-    
     # execute the query for the JSON that includes templeton status
     # execute the query for the JSON that includes templeton status
     url_response = urllib2.urlopen(query)
     url_response = urllib2.urlopen(query)
-  except:
-    label = CRITICAL_CONNECTION_MESSAGE.format(uri)
+  except urllib2.HTTPError, httpError:
+    label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
+      str(httpError))
+
+    return (RESULT_CODE_CRITICAL, [label])
+  except Exception, exception:
+    label = CRITICAL_CONNECTION_MESSAGE.format(query, str(exception))
     return (RESULT_CODE_CRITICAL, [label])
     return (RESULT_CODE_CRITICAL, [label])
 
 
   # URL response received, parse it
   # URL response received, parse it
@@ -111,6 +125,12 @@ def execute(parameters=None, host_name=None):
     node_healthy = str(node_healthy)
     node_healthy = str(node_healthy)
   except:
   except:
     return (RESULT_CODE_CRITICAL, [query])
     return (RESULT_CODE_CRITICAL, [query])
+  finally:
+    if url_response is not None:
+      try:
+        url_response.close()
+      except:
+        pass
 
 
   # proper JSON received, compare against known value
   # proper JSON received, compare against known value
   if node_healthy.lower() == 'true':
   if node_healthy.lower() == 'true':