Browse Source

AMBARI-7911 - Alerts: Convert YARN, MapR, and ZK Nagios Alerts Into Ambari (jonathanhurley)

Jonathan Hurley 10 years ago
parent
commit
d2b8f3e96e
21 changed files with 1176 additions and 664 deletions
  1. 30 26
      ambari-agent/src/main/python/ambari_agent/AlertSchedulerHandler.py
  2. 54 38
      ambari-agent/src/main/python/ambari_agent/alerts/base_alert.py
  3. 8 8
      ambari-agent/src/main/python/ambari_agent/alerts/collector.py
  4. 4 4
      ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
  5. 2 3
      ambari-agent/src/main/python/ambari_agent/alerts/port_alert.py
  6. 40 8
      ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py
  7. 20 10
      ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py
  8. 7 13
      ambari-agent/src/test/python/ambari_agent/TestAlerts.py
  9. 40 2
      ambari-agent/src/test/python/ambari_agent/dummy_files/test_script.py
  10. 16 10
      ambari-common/src/main/python/resource_management/libraries/functions/get_port_from_url.py
  11. 5 1
      ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertHistoryResourceProvider.java
  12. 4 3
      ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertNoticeResourceProvider.java
  13. 1 1
      ambari-server/src/main/java/org/apache/ambari/server/orm/entities/AlertDefinitionEntity.java
  14. 23 5
      ambari-server/src/main/java/org/apache/ambari/server/state/alert/AlertDefinitionFactory.java
  15. 21 0
      ambari-server/src/main/java/org/apache/ambari/server/state/alert/AlertUri.java
  16. 24 22
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/alerts.json
  17. 401 399
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/alerts.json
  18. 310 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/alerts.json
  19. 51 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/ZOOKEEPER/alerts.json
  20. 1 0
      ambari-server/src/test/java/org/apache/ambari/server/api/services/AmbariMetaInfoTest.java
  21. 114 111
      ambari-server/src/test/resources/stacks/HDP/2.0.5/services/HDFS/alerts.json

+ 30 - 26
ambari-agent/src/main/python/ambari_agent/AlertSchedulerHandler.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-'''
+"""
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements.  See the NOTICE file
 distributed with this work for additional information
@@ -16,11 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""
 
-'''
+"""
 http://apscheduler.readthedocs.org/en/v2.1.2
-'''
+"""
 import json
 import logging
 import os
@@ -71,7 +71,7 @@ class AlertSchedulerHandler():
 
 
   def update_definitions(self, alert_commands, reschedule_jobs=False):
-    ''' updates the persisted definitions and restarts the scheduler '''
+    """ updates the persisted definitions and restarts the scheduler """
     
     with open(os.path.join(self.cachedir, self.FILENAME), 'w') as f:
       json.dump(alert_commands, f, indent=2)
@@ -85,7 +85,7 @@ class AlertSchedulerHandler():
 
 
   def start(self):
-    ''' loads definitions from file and starts the scheduler '''
+    """ loads definitions from file and starts the scheduler """
 
     if self.__scheduler is None:
       return
@@ -113,10 +113,10 @@ class AlertSchedulerHandler():
 
 
   def reschedule(self):
-    '''
+    """
     Removes jobs that are scheduled where their UUID no longer is valid. 
     Schedules jobs where the definition UUID is not currently scheduled.
-    '''
+    """
     jobs_scheduled = 0
     jobs_removed = 0
     
@@ -159,12 +159,12 @@ class AlertSchedulerHandler():
 
 
   def collector(self):
-    ''' gets the collector for reporting to the server '''
+    """ gets the collector for reporting to the server """
     return self._collector
   
 
   def __load_definitions(self):
-    ''' loads all alert commands from the file.  all clusters are stored in one file '''
+    """ loads all alert commands from the file.  all clusters are stored in one file """
     definitions = []
     
     all_commands = None
@@ -187,28 +187,28 @@ class AlertSchedulerHandler():
         configmap = command_json['configurations']
 
       for definition in command_json['alertDefinitions']:
-        obj = self.__json_to_callable(clusterName, hostName, definition)
+        alert = self.__json_to_callable(clusterName, hostName, definition)
         
-        if obj is None:
+        if alert is None:
           continue
           
         # get the config values for the alerts 'lookup keys',
         # eg: hdfs-site/dfs.namenode.http-address : host_and_port        
-        vals = self.__find_config_values(configmap, obj.get_lookup_keys())
+        vals = self.__find_config_values(configmap, alert.get_lookup_keys())
         self.__config_maps[clusterName].update(vals)
 
-        obj.set_helpers(self._collector, self.__config_maps[clusterName])
+        alert.set_helpers(self._collector, self.__config_maps[clusterName])
 
-        definitions.append(obj)
+        definitions.append(alert)
       
     return definitions
 
 
   def __json_to_callable(self, clusterName, hostName, json_definition):
-    '''
+    """
    converts the json that represents all aspects of a definition
    and makes an object that extends BaseAlert that is used for individual alert checks
-    '''
+    """
     source = json_definition['source']
     source_type = source.get('type', '')
 
@@ -234,7 +234,11 @@ class AlertSchedulerHandler():
 
 
   def __find_config_values(self, configmap, obj_keylist):
-    ''' finds templated values in the configuration map provided  by the server '''
+    """ 
+    finds templated values in the configuration map provided by the server
+    and returns a dictionary of template key to value 
+    """
+    
     if configmap is None:
       return {}
     
@@ -253,10 +257,10 @@ class AlertSchedulerHandler():
 
  
   def update_configurations(self, commands):
-    '''
+    """
     when an execution command comes in, update any necessary values.
     status commands do not contain useful configurations
-    '''
+    """
     for command in commands:
       clusterName = command['clusterName']
       if not clusterName in self.__config_maps:
@@ -270,13 +274,13 @@ class AlertSchedulerHandler():
         
 
   def schedule_definition(self,definition):
-    '''
+    """
     Schedule a definition (callable). Scheduled jobs are given the UUID
     as their name so that they can be identified later on.
     <p/>
     This function can be called with a definition that is disabled; it will
     simply NOOP.
-    '''
+    """
     # NOOP if the definition is disabled; don't schedule it
     if definition.is_enabled() == False:
       logger.info("The alert {0} with UUID {1} is disabled and will not be scheduled".format(
@@ -302,10 +306,10 @@ class AlertSchedulerHandler():
   
 
   def get_job_count(self):
-    '''
+    """
     Gets the number of jobs currently scheduled. This is mainly used for
     test verification of scheduling
-    '''
+    """
     if self.__scheduler is None:
       return 0
     
@@ -313,11 +317,11 @@ class AlertSchedulerHandler():
 
   
   def execute_alert(self, execution_commands):
-    '''
+    """
     Executes an alert immediately, ignoring any scheduled jobs. The existing
     jobs remain untouched. The result of this is stored in the alert
    collector for transmission during the next heartbeat
-    '''
+    """
     if self.__scheduler is None or execution_commands is None:
       return
 

+ 54 - 38
ambari-agent/src/main/python/ambari_agent/alerts/base_alert.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-'''
+"""
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements.  See the NOTICE file
 distributed with this work for additional information
@@ -16,12 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""
 
 import logging
 import re
 import time
-import traceback
 from collections import namedtuple
 
 logger = logging.getLogger()
@@ -41,7 +40,7 @@ class BaseAlert(object):
     
     
   def interval(self):
-    ''' gets the defined interval this check should run '''
+    """ gets the defined interval this check should run """
     if not self.alert_meta.has_key('interval'):
       return 1
     else:
@@ -50,40 +49,40 @@ class BaseAlert(object):
 
 
   def is_enabled(self):
-    '''
+    """
     gets whether the definition is enabled
-    '''
+    """
     return self.alert_meta['enabled']
   
 
   def get_name(self):
-    '''
+    """
     gets the unique name of the alert definition
-    '''
+    """
     return self.alert_meta['name']
 
 
   def get_uuid(self):
-    '''
+    """
    gets the unique hash of the alert definition
-    '''
+    """
     return self.alert_meta['uuid']
 
 
   def set_helpers(self, collector, value_dict):
-    ''' sets helper objects for alerts without having to use them in a constructor '''
+    """ sets helper objects for alerts without having to use them in a constructor """
     self.collector = collector
     self.config_value_dict = value_dict
 
 
   def set_cluster(self, cluster, host):
-    ''' sets cluster information for the alert '''
+    """ sets cluster information for the alert """
     self.cluster = cluster
     self.host_name = host
 
 
   def collect(self):
-    ''' method used for collection.  defers to _collect() '''
+    """ method used for collection.  defers to _collect() """
     
     res = (BaseAlert.RESULT_UNKNOWN, [])
     res_base_text = "Unknown {0}"
@@ -126,7 +125,7 @@ class BaseAlert(object):
 
 
   def _find_value(self, meta_key):
-    ''' safe way to get a value when outputting result json.  will not throw an exception '''
+    """ safe way to get a value when outputting result json.  will not throw an exception """
     if self.alert_meta.has_key(meta_key):
       return self.alert_meta[meta_key]
     else:
@@ -134,14 +133,15 @@ class BaseAlert(object):
 
 
   def get_lookup_keys(self):
-    ''' returns a list of lookup keys found for this alert '''
+    """ returns a list of lookup keys found for this alert """
     return self._lookup_keys
 
 
   def _find_lookup_property(self, key):
-    '''
-    check if the supplied key is parameterized
-    '''
+    """
+    check if the supplied key is parameterized and appends the extracted key
+    to the array of keys
+    """
     keys = re.findall("{{([\S]+)}}", key)
     
     if len(keys) > 0:
@@ -153,9 +153,9 @@ class BaseAlert(object):
 
 
   def _lookup_property_value(self, key):
-    '''
+    """
     in the case of specifying a configuration path, lookup that path's value
-    '''
+    """
     if not key in self._lookup_keys:
       return key
 
@@ -166,7 +166,7 @@ class BaseAlert(object):
 
     
   def _lookup_uri_property_keys(self, uri_structure):
-    '''
+    """
     Loads the configuration lookup keys that the URI structure needs. This
     will return a named tuple that contains the keys needed to lookup
     parameterized URI values from the URI structure. The URI structure looks 
@@ -177,7 +177,7 @@ class BaseAlert(object):
       "https": bar,
       ...
     }
-    '''
+    """
     
     if uri_structure is None:
       return None
@@ -186,6 +186,7 @@ class BaseAlert(object):
     https_key = None
     https_property_key = None
     https_property_value_key = None
+    default_port = None
     
     if 'http' in uri_structure:
       http_key = self._find_lookup_property(uri_structure['http'])
@@ -199,17 +200,21 @@ class BaseAlert(object):
     if 'https_property_value' in uri_structure:
       https_property_value_key = uri_structure['https_property_value']
 
+    if 'default_port' in uri_structure:
+      default_port = uri_structure['default_port']
+
     AlertUriLookupKeys = namedtuple('AlertUriLookupKeys', 
-        'http https https_property https_property_value')
+        'http https https_property https_property_value default_port')
     
     alert_uri_lookup_keys = AlertUriLookupKeys(http=http_key, https=https_key, 
-        https_property=https_property_key, https_property_value=https_property_value_key)
+        https_property=https_property_key, 
+        https_property_value=https_property_value_key, default_port=default_port)
     
     return alert_uri_lookup_keys
 
     
   def _get_uri_from_structure(self, alert_uri_lookup_keys):
-    '''
+    """
     Gets the URI to use by examining the URI structure from the definition.
     This will return a named tuple that has the uri and the SSL flag. The
     URI structure looks something like:
@@ -219,7 +224,7 @@ class BaseAlert(object):
       "https": bar,
       ...
     }
-    '''
+    """
     
     if alert_uri_lookup_keys is None:
       return None
@@ -229,6 +234,9 @@ class BaseAlert(object):
     https_property = None
     https_property_value = None
 
+    # create a named tuple to return both the concrete URI and SSL flag
+    AlertUri = namedtuple('AlertUri', 'uri is_ssl_enabled')
+    
     # attempt to parse and parameterize the various URIs; properties that
    # do not exist in the lookup map are returned as None
     if alert_uri_lookup_keys.http is not None:
@@ -243,9 +251,14 @@ class BaseAlert(object):
     if alert_uri_lookup_keys.https_property_value is not None:
       https_property_value = self._lookup_property_value(alert_uri_lookup_keys.https_property_value)
 
-    # without a URI, there's no way to create the structure we need    
+    # without a URI, there's no way to create the structure we need - return
+    # the default port if specified, otherwise throw an exception
     if http_uri is None and https_uri is None:
-      raise Exception("Could not determine result. Either the http or https URI must be specified.")
+      if alert_uri_lookup_keys.default_port is not None:
+        alert_uri = AlertUri(uri=alert_uri_lookup_keys.default_port, is_ssl_enabled=False)
+        return alert_uri
+      else:
+        raise Exception("Could not determine result. Either the http or https URI must be specified.")
 
     # start out assuming plaintext
     uri = http_uri
@@ -260,22 +273,19 @@ class BaseAlert(object):
         is_ssl_enabled = True
         uri = https_uri
     
-    # create a named tuple to return both the concrete URI and SSL flag
-    AlertUri = namedtuple('AlertUri', 'uri is_ssl_enabled')
     alert_uri = AlertUri(uri=uri, is_ssl_enabled=is_ssl_enabled)
-    
     return alert_uri
 
 
   def _collect(self):
-    '''
+    """
     Low level function to collect alert data.  The result is a tuple as:
     res[0] = the result code
     res[1] = the list of arguments supplied to the reporting text for the result code
-    '''  
+    """  
     raise NotImplementedError
 
-  '''
+  """
   See RFC3986, Appendix B
   Tested on the following cases:
     "192.168.54.1"
@@ -284,9 +294,16 @@ class BaseAlert(object):
     "ftp://192.168.54.4:7842/foo/bar"
 
    Returns None if only a port is passed in
-  '''
+  """
   @staticmethod
   def get_host_from_url(uri):
+    if uri is None:
+      return None
+    
+    # if not a string, return None
+    if not isinstance(uri, basestring):
+      return None    
+        
     # RFC3986, Appendix B
     parts = re.findall('^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?', uri)
 
@@ -306,10 +323,9 @@ class BaseAlert(object):
       host_and_port = parts[0][3]
 
     if -1 == host_and_port.find(':'):
-      # if no : then it might only be a port; if it's a port, return this host
       if host_and_port.isdigit():
-        return None
-
+        return None    
+      
       return host_and_port
     else:
       return host_and_port.split(':')[0]

+ 8 - 8
ambari-agent/src/main/python/ambari_agent/alerts/collector.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-'''
+"""
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements.  See the NOTICE file
 distributed with this work for additional information
@@ -16,16 +16,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""
 
 import logging
 
 logger = logging.getLogger()
 
 class AlertCollector():
-  '''
+  """
   cluster -> name -> alert dict
-  '''  
+  """  
   def __init__(self):
     self.__buckets = {}
 
@@ -38,9 +38,9 @@ class AlertCollector():
 
 
   def remove(self, cluster, alert_name):
-    '''
+    """
     Removes the alert with the specified name if it exists in the dictionary
-    '''
+    """
     if not cluster in self.__buckets:
       return
     
@@ -48,9 +48,9 @@ class AlertCollector():
 
 
   def remove_by_uuid(self, alert_uuid):
-    '''
+    """
     Removes the alert with the specified uuid if it exists in the dictionary
-    '''
+    """
     for cluster,alert_map in self.__buckets.iteritems():
       for alert_name in alert_map.keys():
         alert = alert_map[alert_name]

+ 4 - 4
ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-'''
+"""
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements.  See the NOTICE file
 distributed with this work for additional information
@@ -16,7 +16,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""
 
 import imp
 import json
@@ -121,7 +121,7 @@ class MetricAlert(BaseAlert):
 
     
   def __find_threshold(self, reporting_type):
-    ''' find the defined thresholds for alert values '''
+    """ find the defined thresholds for alert values """
     
     if not 'reporting' in self.alert_source_meta:
       return None
@@ -136,7 +136,7 @@ class MetricAlert(BaseAlert):
 
     
   def _load_jmx(self, ssl, host, port, jmx_metric):
-    ''' creates a JmxMetric object that holds info about jmx-based metrics '''
+    """ creates a JmxMetric object that holds info about jmx-based metrics """
     
     logger.debug(str(jmx_metric.property_map))
     

+ 2 - 3
ambari-agent/src/main/python/ambari_agent/alerts/port_alert.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-'''
+"""
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements.  See the NOTICE file
 distributed with this work for additional information
@@ -16,10 +16,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""
 
 import logging
-import re
 import socket
 import time
 from alerts.base_alert import BaseAlert

+ 40 - 8
ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-'''
+"""
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements.  See the NOTICE file
 distributed with this work for additional information
@@ -16,18 +16,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""
 
 import imp
 import logging
 import os
 from alerts.base_alert import BaseAlert
+from symbol import parameters
 
 logger = logging.getLogger()
 
 class ScriptAlert(BaseAlert):
   def __init__(self, alert_meta, alert_source_meta):
-    ''' ScriptAlert reporting structure is output from the script itself '''
+    """ ScriptAlert reporting structure is output from the script itself """
     
     alert_source_meta['reporting'] = {
       'ok': { 'text': '{0}' },
@@ -44,8 +45,39 @@ class ScriptAlert(BaseAlert):
       
     if 'stacks_dir' in alert_source_meta:
       self.stacks_dir = alert_source_meta['stacks_dir']
+      
+    # execute the get_tokens() method so that this script correctly populates
+    # its list of keys
+    try:
+      cmd_module = self._load_source()
+      tokens = cmd_module.get_tokens()
+        
+      # for every token, populate the array keys that this alert will need
+      if tokens is not None:
+        for token in tokens:
+          # append the key to the list of keys for this alert
+          self._find_lookup_property(token)
+    except:
+      logger.exception("Unable to parameterize tokens for script {0}".format(self.path))
+      pass
+              
     
   def _collect(self):
+    cmd_module = self._load_source()
+    if cmd_module is not None:
+      # convert the dictionary from 
+      # {'foo-site/bar': 'baz'} into 
+      # {'{{foo-site/bar}}': 'baz'}
+      parameters = {}
+      for key in self.config_value_dict:
+        parameters['{{' + key + '}}'] = self.config_value_dict[key]
+      
+      return cmd_module.execute(parameters)
+    else:
+      return ((self.RESULT_UNKNOWN, ["Unable to execute script {0}".format(self.path)]))
+    
+
+  def _load_source(self):
     if self.path is None and self.stack_path is None:
       raise Exception("The attribute 'path' must be specified")
 
@@ -63,8 +95,8 @@ class ScriptAlert(BaseAlert):
       logger.debug("Executing script check {0}".format(path_to_script))
 
           
-    if (path_to_script.endswith('.py')):
-      cmd_module = imp.load_source(self._find_value('name'), path_to_script)
-      return cmd_module.execute()
-    else:
-      return ((self.RESULT_UNKNOWN, ["could not execute script {0}".format(path_to_script)]))
+    if (not path_to_script.endswith('.py')):
+      logger.error("Unable to execute script {0}".format(path_to_script))
+      return None
+    
+    return imp.load_source(self._find_value('name'), path_to_script)

+ 20 - 10
ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-'''
+"""
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements.  See the NOTICE file
 distributed with this work for additional information
@@ -16,11 +16,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""
 
 import logging
+import time
 import urllib2
 from alerts.base_alert import BaseAlert
+from collections import namedtuple
 from resource_management.libraries.functions.get_port_from_url import get_port_from_url
 
 logger = logging.getLogger()
@@ -58,33 +60,41 @@ class WebAlert(BaseAlert):
     except:
       pass
 
-    status_code = self._make_web_request(host, port, alert_uri.is_ssl_enabled)
+    web_response = self._make_web_request(host, port, alert_uri.is_ssl_enabled)
+    status_code = web_response.status_code
+    time_seconds = web_response.time_millis / 1000
 
     if status_code == 0:
-      return (self.RESULT_CRITICAL, [status_code, host, port])
+      return (self.RESULT_CRITICAL, [status_code, host, port, time_seconds])
     
     if status_code <= 401:
-      return (self.RESULT_OK, [status_code, host, port])
+      return (self.RESULT_OK, [status_code, host, port, time_seconds])
     
-    return (self.RESULT_WARNING, [status_code, host, port])
+    return (self.RESULT_WARNING, [status_code, host, port, time_seconds])
 
 
   def _make_web_request(self, host, port, ssl):
-    '''
+    """
     Makes an http(s) request to a web resource and returns the http code. If
     there was an error making the request, return 0 for the status code.
-    '''    
+    """    
     url = "{0}://{1}:{2}".format(
         "https" if ssl else "http", host, str(port))
     
+    WebResponse = namedtuple('WebResponse', 'status_code time_millis')
+    
+    time_millis = 0
+    
     try:
+      start_time = time.time()      
       response = urllib2.urlopen(url)
+      time_millis = time.time() - start_time
     except:
       if logger.isEnabledFor(logging.DEBUG):
         logger.exception("Unable to make a web request.")
       
-      return 0
+      return WebResponse(status_code=0, time_millis=0)
     
-    return response.getcode()
+    return WebResponse(status_code=response.getcode(), time_millis=time_millis) 
   
   

+ 7 - 13
ambari-agent/src/test/python/ambari_agent/TestAlerts.py

@@ -28,6 +28,7 @@ from ambari_agent.alerts.script_alert import ScriptAlert
 from ambari_agent.alerts.web_alert import WebAlert
 from ambari_agent.apscheduler.scheduler import Scheduler
 
+from collections import namedtuple
 from mock.mock import patch
 from unittest import TestCase
 
@@ -131,14 +132,6 @@ class TestAlerts(TestCase):
       "source": {
         "type": "SCRIPT",
         "path": "test_script.py",
-        "reporting": {
-          "ok": {
-            "text": "TCP OK - {0:.4f} response time on port {1}"
-          },
-          "critical": {
-            "text": "Could not load process info: {0}"
-          }
-        }
       }
     }
 
@@ -147,14 +140,14 @@ class TestAlerts(TestCase):
 
     collector = AlertCollector()
     sa = ScriptAlert(json, json['source'])
-    sa.set_helpers(collector, '')
+    sa.set_helpers(collector, {'foo-site/bar': 'rendered-bar', 'foo-site/baz':'rendered-baz'} )
     self.assertEquals(json['source']['path'], sa.path)
     self.assertEquals(json['source']['stacks_dir'], sa.stacks_dir)
 
     sa.collect()
 
     self.assertEquals('WARNING', collector.alerts()[0]['state'])
-    self.assertEquals('all is not well', collector.alerts()[0]['text'])
+    self.assertEquals('bar is rendered-bar, baz is rendered-baz', collector.alerts()[0]['text'])
 
 
   @patch.object(MetricAlert, "_load_jmx")
@@ -339,7 +332,8 @@ class TestAlerts(TestCase):
       }
     }
 
-    wa_make_web_request_mock.return_value = 200
+    WebResponse = namedtuple('WebResponse', 'status_code time_millis')
+    wa_make_web_request_mock.return_value = WebResponse(200,1.234)
 
     # run the alert and check HTTP 200    
     collector = AlertCollector()
@@ -351,7 +345,7 @@ class TestAlerts(TestCase):
     self.assertEquals('ok: 200', collector.alerts()[0]['text'])
 
     # run the alert and check HTTP 500
-    wa_make_web_request_mock.return_value = 500
+    wa_make_web_request_mock.return_value = WebResponse(500,1.234)
     collector = AlertCollector()
     alert = WebAlert(json, json['source'])
     alert.set_helpers(collector, {'hdfs-site/dfs.datanode.http.address': '1.2.3.4:80'})
@@ -361,7 +355,7 @@ class TestAlerts(TestCase):
     self.assertEquals('warning: 500', collector.alerts()[0]['text'])
 
     # run the alert and check critical
-    wa_make_web_request_mock.return_value = 0
+    wa_make_web_request_mock.return_value = WebResponse(0,0)
      
     collector = AlertCollector()
     alert = WebAlert(json, json['source'])

+ 40 - 2
ambari-agent/src/test/python/ambari_agent/dummy_files/test_script.py

@@ -1,3 +1,41 @@
+#!/usr/bin/env python
 
-def execute(params=None):
-  return (('WARNING', ['all is not well', str(params)]))
+'''
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+def get_tokens():
+  '''
+  return a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  '''
+  return ('{{foo-site/bar}}','{{foo-site/baz}}')
+  
+
+def execute(parameters=None):
+  '''
+  returns a tuple containing the result code and a pre-formatted result label
+  '''
+  if parameters is not None:
+    if '{{foo-site/bar}}' in parameters:
+      bar = parameters['{{foo-site/bar}}']
+    
+    if '{{foo-site/baz}}' in parameters:
+      baz = parameters['{{foo-site/baz}}']
+  
+  label = "bar is {0}, baz is {1}".format(bar, baz)  
+  return (('WARNING', [label]))

+ 16 - 10
ambari-common/src/main/python/resource_management/libraries/functions/get_port_from_url.py

@@ -27,14 +27,20 @@ import re
 
 def get_port_from_url(address):
   """
-  Return port from URL. If address is UnknownConfiguration,
-  UnknownConfiguration will be returned. If no port was found, Fail will be
-  raised.
+  Return port from URL. If the address is numeric, the address is assumed to be a port and is returned.
+  If address is UnknownConfiguration, UnknownConfiguration will be returned. 
+  If no port was found, Fail will be raised.
   """
-  if not is_empty(address):
-    port = re.findall(":([\d]{1,5})(?=/|$)", address)
-    if port:
-      return port[0]
-    raise Fail("No port in URL:{0}".format(address))
-  else:
-    return address
+  if is_empty(address):
+    return address
+  
+  if isinstance(address, (int, long)):
+    return address  
+  
+  port = re.findall(":([\d]{1,5})(?=/|$)", address)
+  if port:
+    return port[0]
+  elif address.isdigit():
+    return address
+
+  raise Fail("No port in URL:{0}".format(address))

+ 5 - 1
ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertHistoryResourceProvider.java

@@ -214,7 +214,11 @@ public class AlertHistoryResourceProvider extends AbstractResourceProvider {
     Resource resource = new ResourceImpl(Resource.Type.AlertHistory);
     resource.setProperty(ALERT_HISTORY_ID, entity.getAlertId());
 
-    setResourceProperty(resource, ALERT_HISTORY_CLUSTER_NAME,cluster.getClusterName(), requestedIds);
+    if (null != cluster) {
+      setResourceProperty(resource, ALERT_HISTORY_CLUSTER_NAME,
+          cluster.getClusterName(), requestedIds);
+    }
+
     setResourceProperty(resource, ALERT_HISTORY_DEFINITION_ID, definition.getDefinitionId(), requestedIds);
     setResourceProperty(resource, ALERT_HISTORY_DEFINITION_NAME, definition.getDefinitionName(), requestedIds);
     setResourceProperty(resource, ALERT_HISTORY_SERVICE_NAME, entity.getServiceName(), requestedIds);

+ 4 - 3
ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertNoticeResourceProvider.java

@@ -228,9 +228,10 @@ public class AlertNoticeResourceProvider extends AbstractResourceProvider {
     setResourceProperty(resource, ALERT_NOTICE_HISTORY_ID,
         history.getAlertId(), requestedIds);
 
-    setResourceProperty(resource, ALERT_NOTICE_CLUSTER_NAME,
-        cluster.getClusterName(),
-        requestedIds);
+    if (null != cluster) {
+      setResourceProperty(resource, ALERT_NOTICE_CLUSTER_NAME,
+          cluster.getClusterName(), requestedIds);
+    }
 
     return resource;
   }

+ 1 - 1
ambari-server/src/main/java/org/apache/ambari/server/orm/entities/AlertDefinitionEntity.java

@@ -79,7 +79,7 @@ public class AlertDefinitionEntity {
   @Column(name = "cluster_id", nullable = false)
   private Long clusterId;
 
-  @ManyToOne(fetch = FetchType.LAZY)
+  @ManyToOne(fetch = FetchType.EAGER)
   @JoinColumn(name = "cluster_id", referencedColumnName = "cluster_id", insertable = false, updatable = false)
   private ClusterEntity clusterEntity;
 

+ 23 - 5
ambari-server/src/main/java/org/apache/ambari/server/state/alert/AlertDefinitionFactory.java

@@ -75,28 +75,46 @@ public class AlertDefinitionFactory {
 
   /**
    * Gets a list of all of the alert definitions defined in the specified JSON
-   * {@link File} for the given service.
+   * {@link File} for the given service. Each of the JSON files should have a
+   * mapping between the service and the alerts defined for that service. This
+   * is necessary since some services are combined in a single
+   * {@code metainfo.xml} and only have a single directory on the stack.
    *
    * @param alertDefinitionFile
+   *          the JSON file from the stack to read (not {@code null}).
    * @param serviceName
-   * @return
+   *          the name of the service to extract definitions for (not
+   *          {@code null}).
+   * @return the definitions for the specified service, or an empty set.
    * @throws AmbariException
    *           if there was a problem reading the file or parsing the JSON.
    */
   public Set<AlertDefinition> getAlertDefinitions(File alertDefinitionFile,
       String serviceName) throws AmbariException {
-    Map<String,List<AlertDefinition>> definitionMap = null;
+
+    // { MAPR : {definitions}, YARN : {definitions} }
+    Map<String, Map<String, List<AlertDefinition>>> serviceDefinitionMap = null;
 
     try {
-      Type type = new TypeToken<Map<String, List<AlertDefinition>>>(){}.getType();
+      Type type = new TypeToken<Map<String, Map<String, List<AlertDefinition>>>>() {}.getType();
 
-      definitionMap = m_gson.fromJson(new FileReader(alertDefinitionFile), type);
+      FileReader fileReader = new FileReader(alertDefinitionFile);
+      serviceDefinitionMap = m_gson.fromJson(fileReader, type);
     } catch (Exception e) {
       LOG.error("Could not read the alert definition file", e);
       throw new AmbariException("Could not read alert definition file", e);
     }
 
     Set<AlertDefinition> definitions = new HashSet<AlertDefinition>();
+
+    // it's OK if the service doesn't have any definitions; this can happen if
+    // 2 services are defined in a single metainfo.xml and only 1 service has
+    // alerts defined
+    Map<String, List<AlertDefinition>> definitionMap = serviceDefinitionMap.get(serviceName);
+    if (null == definitionMap) {
+      return definitions;
+    }
+
     for (Entry<String, List<AlertDefinition>> entry : definitionMap.entrySet()) {
       for (AlertDefinition ad : entry.getValue()) {
         ad.setServiceName(serviceName);

+ 21 - 0
ambari-server/src/main/java/org/apache/ambari/server/state/alert/AlertUri.java

@@ -55,6 +55,13 @@ public class AlertUri {
   @SerializedName("https_property_value")
   private String m_httpsPropertyValue;
 
+  /**
+   * A default port to use on the host running the alert if no URLs can be
+   * found.
+   */
+  @SerializedName("default_port")
+  private int m_port = 0;
+
   /**
    * Gets the plaintext (HTTP) URI that can be used to retrieve alert
    * information.
@@ -65,6 +72,16 @@ public class AlertUri {
     return m_httpUri;
   }
 
+  /**
+   * Gets the default port to use on the host running the alert if none of the
+   * http properties are available.
+   *
+   * @return the default port if none of the http properties are found.
+   */
+  public int getDefaultPort() {
+    return m_port;
+  }
+
   /**
    * Gets the secure (HTTPS) URI that can be used to retrieve alert information.
    *
@@ -117,6 +134,7 @@ public class AlertUri {
     result = prime * result
         + ((m_httpsUri == null) ? 0 : m_httpsUri.hashCode());
 
+    result = prime * result + m_port;
     return result;
   }
 
@@ -170,6 +188,9 @@ public class AlertUri {
       return false;
     }
 
+    if (m_port != other.m_port) {
+      return false;
+    }
     return true;
   }
 }

+ 24 - 22
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/alerts.json

@@ -1,24 +1,26 @@
 {
-  "service": [],
-  "HBASE_MASTER": [
-    {
-      "name": "hbase_master_process",
-      "label": "HBase Master Process",
-      "interval": 1,
-      "scope": "ANY",
-      "source": {
-        "type": "PORT",
-        "uri": "{{hbase-site/hbase.master.port}}",
-        "default_port": 60000,
-        "reporting": {
-          "ok": {
-            "text": "TCP OK - {0:.4f} response on port {1}"
-          },
-          "critical": {
-            "text": "Connection failed: {0} on host {1}:{2}"
-          }
-        }        
+  "HBASE": {
+    "service": [],
+    "HBASE_MASTER": [
+      {
+        "name": "hbase_master_process",
+        "label": "HBase Master Process",
+        "interval": 1,
+        "scope": "ANY",
+        "source": {
+          "type": "PORT",
+          "uri": "{{hbase-site/hbase.master.port}}",
+          "default_port": 60000,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.4f} response on port {1}"
+            },
+            "critical": {
+              "text": "Connection failed: {0} on host {1}:{2}"
+            }
+          }        
+        }
       }
-    }
-  ]
-}
+    ]
+  }
+}

+ 401 - 399
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/alerts.json

@@ -1,433 +1,435 @@
 {
-  "service": [
-    {
-      "name": "datanode_process_percent",
-      "label": "Percent DataNodes Available",
-      "interval": 1,
-      "scope": "SERVICE",
-      "enabled": true,
-      "source": {
-        "type": "AGGREGATE",
-        "alert_name": "datanode_process",
-        "reporting": {
-          "ok": {
-            "text": "OK: total: <{0}>, affected: <{1}>"
-          },
-          "warning": {
-            "text": "OK: total: <{0}>, affected: <{1}>",
-            "value": 0.1
-          },
-          "critical": {
-            "text": "CRITICAL: total: <{0}>, affected <{1}>",
-            "value": 0.3
+  "HDFS":{
+    "service": [
+      {
+        "name": "datanode_process_percent",
+        "label": "Percent DataNodes Available",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": true,
+        "source": {
+          "type": "AGGREGATE",
+          "alert_name": "datanode_process",
+          "reporting": {
+            "ok": {
+              "text": "affected: [{1}], total: [{0}]"
+            },
+            "warning": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.1
+            },
+            "critical": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.3
+            }
           }
         }
-      }
-    },
-    {
-      "name": "datanode_storage_percent",
-      "label": "Percent DataNodes With Available Space",
-      "interval": 1,
-      "scope": "SERVICE",
-      "enabled": true,
-      "source": {
-        "type": "AGGREGATE",
-        "alert_name": "datanode_storage",
-        "reporting": {
-          "ok": {
-            "text": "OK: total: <{0}>, affected: <{1}>"
-          },
-          "warning": {
-            "text": "OK: total: <{0}>, affected: <{1}>",
-            "value": 0.1
-          },
-          "critical": {
-            "text": "CRITICAL: total: <{0}>, affected <{1}>",
-            "value": 0.3
+      },
+      {
+        "name": "datanode_storage_percent",
+        "label": "Percent DataNodes With Available Space",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": true,
+        "source": {
+          "type": "AGGREGATE",
+          "alert_name": "datanode_storage",
+          "reporting": {
+            "ok": {
+              "text": "affected: [{1}], total: [{0}]"
+            },
+            "warning": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.1
+            },
+            "critical": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.3
+            }
           }
         }
-      }
-    },
-    {
-      "name": "journalnode_process_percent",
-      "label": "Percent JournalNodes Available",
-      "interval": 1,
-      "scope": "SERVICE",
-      "enabled": true,
-      "source": {
-        "type": "AGGREGATE",
-        "alert_name": "journalnode_process",
-        "reporting": {
-          "ok": {
-            "text": "OK: total: <{0}>, affected: <{1}>"
-          },
-          "warning": {
-            "text": "OK: total: <{0}>, affected: <{1}>",
-            "value": 0.33
-          },
-          "critical": {
-            "text": "CRITICAL: total: <{0}>, affected <{1}>",
-            "value": 0.50
+      },
+      {
+        "name": "journalnode_process_percent",
+        "label": "Percent JournalNodes Available",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": true,
+        "source": {
+          "type": "AGGREGATE",
+          "alert_name": "journalnode_process",
+          "reporting": {
+            "ok": {
+              "text": "affected: [{1}], total: [{0}]"
+            },
+            "warning": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.33
+            },
+            "critical": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.50
+            }
           }
         }
       }
-    }
-  ],
-  "NAMENODE": [
-    {
-      "name": "namenode_webui",
-      "label": "NameNode Web UI",
-      "interval": 1,
-      "scope": "ANY",
-      "enabled": true,
-      "source": {
-        "type": "WEB",
-        "uri": {
-          "http": "{{hdfs-site/dfs.namenode.http-address}}",
-          "https": "{{hdfs-site/dfs.namenode.https-address}}",
-          "https_property": "{{hdfs-site/dfs.http.policy}}",
-          "https_property_value": "HTTPS_ONLY"
-        },
-        "reporting": {
-          "ok": {
-            "text": "The UI returned a response code of {0}"
+    ],
+    "NAMENODE": [
+      {
+        "name": "namenode_webui",
+        "label": "NameNode Web UI",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "WEB",
+          "uri": {
+            "http": "{{hdfs-site/dfs.namenode.http-address}}",
+            "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
           },
-          "warning":{
-            "text": "The UI returned a response code of {0}"
-          },
-          "critical": {
-            "text": "Connection failed to {1}:{2}"
-          }
-        }        
-      }
-    },
-    {
-      "name": "namenode_cpu",
-      "label": "NameNode Host CPU Utilization",
-      "interval": 5,
-      "scope": "ANY",
-      "enabled": true,
-      "source": {
-        "type": "METRIC",
-        "uri": {
-          "http": "{{hdfs-site/dfs.namenode.http-address}}",
-          "https": "{{hdfs-site/dfs.namenode.https-address}}",
-          "https_property": "{{hdfs-site/dfs.http.policy}}",
-          "https_property_value": "HTTPS_ONLY"
-        },
-        "reporting": {
-          "ok": {
-            "text": "{1} CPU, load {0:.1%}"
+          "reporting": {
+            "ok": {
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "warning":{
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "critical": {
+              "text": "Connection failed to {1}:{2}"
+            }
+          }        
+        }
+      },
+      {
+        "name": "namenode_cpu",
+        "label": "NameNode Host CPU Utilization",
+        "interval": 5,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{hdfs-site/dfs.namenode.http-address}}",
+            "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
           },
-          "warning": {
-            "text": "{1} CPU, load {0:.1%}",
-            "value": 200
+          "reporting": {
+            "ok": {
+              "text": "{1} CPU, load {0:.1%}"
+            },
+            "warning": {
+              "text": "{1} CPU, load {0:.1%}",
+              "value": 200
+            },
+            "critical": {
+              "text": "{1} CPU, load {0:.1%}",
+              "value": 250
+            }
           },
-          "critical": {
-            "text": "{1} CPU, load {0:.1%}",
-            "value": 250
+          "jmx": {
+            "property_list": [
+              "java.lang:type=OperatingSystem/SystemCpuLoad",
+              "java.lang:type=OperatingSystem/AvailableProcessors"
+            ],
+            "value": "{0} * 100"
           }
-        },
-        "jmx": {
-          "property_list": [
-            "java.lang:type=OperatingSystem/SystemCpuLoad",
-            "java.lang:type=OperatingSystem/AvailableProcessors"
-          ],
-          "value": "{0} * 100"
         }
-      }
-    },
-    {
-      "name": "namenode_hdfs_blocks_health",
-      "label": "NameNode Blocks Health",
-      "interval": 2,
-      "scope": "ANY",
-      "enabled": true,
-      "source": {
-        "type": "METRIC",
-        "uri": {
-          "http": "{{hdfs-site/dfs.namenode.http-address}}",
-          "https": "{{hdfs-site/dfs.namenode.https-address}}",
-          "https_property": "{{hdfs-site/dfs.http.policy}}",
-          "https_property_value": "HTTPS_ONLY"
-        },
-        "reporting": {
-          "ok": {
-            "text": "Total Blocks:[{1}], Missing Blocks:[{0}]"
+      },
+      {
+        "name": "namenode_hdfs_blocks_health",
+        "label": "NameNode Blocks Health",
+        "interval": 2,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{hdfs-site/dfs.namenode.http-address}}",
+            "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
           },
-          "warning": {
-            "text": "Total Blocks:[{1}], Missing Blocks:[{0}]",
-            "value": 1
-          },          
-          "critical": {
-            "text": "Total Blocks:[{1}], Missing Blocks:[{0}]",
-            "value": 1
-          }
-        },
-        "jmx": {
-          "property_list": [
-            "Hadoop:service=NameNode,name=FSNamesystem/MissingBlocks",
-            "Hadoop:service=NameNode,name=FSNamesystem/BlocksTotal"
-          ],
-          "value": "{0}"
-        }
-      }
-    },
-    {
-      "name": "namenode_hdfs_capacity_utilization",
-      "label": "HDFS Capacity Utilization",
-      "interval": 2,
-      "scope": "ANY",
-      "enabled": true,
-      "source": {
-        "type": "METRIC",
-        "uri": {
-          "http": "{{hdfs-site/dfs.namenode.http-address}}",
-          "https": "{{hdfs-site/dfs.namenode.https-address}}",
-          "https_property": "{{hdfs-site/dfs.http.policy}}",
-          "https_property_value": "HTTPS_ONLY"
-        },
-        "reporting": {
-          "ok": {
-            "text": "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]"
+          "reporting": {
+            "ok": {
+              "text": "Total Blocks:[{1}], Missing Blocks:[{0}]"
+            },
+            "warning": {
+              "text": "Total Blocks:[{1}], Missing Blocks:[{0}]",
+              "value": 1
+            },          
+            "critical": {
+              "text": "Total Blocks:[{1}], Missing Blocks:[{0}]",
+              "value": 1
+            }
           },
-          "warning": {
-            "text": "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]",
-            "value": 80
-          },          
-          "critical": {
-            "text": "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]",
-            "value": 90
+          "jmx": {
+            "property_list": [
+              "Hadoop:service=NameNode,name=FSNamesystem/MissingBlocks",
+              "Hadoop:service=NameNode,name=FSNamesystem/BlocksTotal"
+            ],
+            "value": "{0}"
           }
-        },
-        "jmx": {
-          "property_list": [
-            "Hadoop:service=NameNode,name=FSNamesystemState/CapacityUsed",
-            "Hadoop:service=NameNode,name=FSNamesystemState/CapacityRemaining"
-          ],
-          "value": "{0}/({0} + {1}) * 100"
         }
-      }
-    },
-    {
-      "name": "namenode_rpc_latency",
-      "label": "NameNode RPC Latency",
-      "interval": 2,
-      "scope": "ANY",
-      "enabled": true,
-      "source": {
-        "type": "METRIC",
-        "uri": {
-          "http": "{{hdfs-site/dfs.namenode.http-address}}",
-          "https": "{{hdfs-site/dfs.namenode.https-address}}",
-          "https_property": "{{hdfs-site/dfs.http.policy}}",
-          "https_property_value": "HTTPS_ONLY"
-        },
-        "reporting": {
-          "ok": {
-            "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]"
+      },
+      {
+        "name": "namenode_hdfs_capacity_utilization",
+        "label": "HDFS Capacity Utilization",
+        "interval": 2,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{hdfs-site/dfs.namenode.http-address}}",
+            "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
           },
-          "warning": {
-            "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
-            "value": 3000
-          },          
-          "critical": {
-            "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
-            "value": 5000
+          "reporting": {
+            "ok": {
+              "text": "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]"
+            },
+            "warning": {
+              "text": "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]",
+              "value": 80
+            },          
+            "critical": {
+              "text": "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]",
+              "value": 90
+            }
+          },
+          "jmx": {
+            "property_list": [
+              "Hadoop:service=NameNode,name=FSNamesystemState/CapacityUsed",
+              "Hadoop:service=NameNode,name=FSNamesystemState/CapacityRemaining"
+            ],
+            "value": "{0}/({0} + {1}) * 100"
           }
-        },
-        "jmx": {
-          "property_list": [
-            "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcQueueTimeAvgTime",
-            "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcProcessingTimeAvgTime"
-          ],
-          "value": "{0}"
         }
-      }
-    },
-    {
-      "name": "namenode_directory_status",
-      "label": "NameNode Directory Status",
-      "interval": 1,
-      "scope": "ANY",
-      "enabled": true,
-      "source": {
-        "type": "METRIC",
-        "uri": {
-          "http": "{{hdfs-site/dfs.namenode.http-address}}",
-          "https": "{{hdfs-site/dfs.namenode.https-address}}",
-          "https_property": "{{hdfs-site/dfs.http.policy}}",
-          "https_property_value": "HTTPS_ONLY"
-        },
-        "reporting": {
-          "ok": {
-            "text": "Directories are healthy"
+      },
+      {
+        "name": "namenode_rpc_latency",
+        "label": "NameNode RPC Latency",
+        "interval": 2,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{hdfs-site/dfs.namenode.http-address}}",
+            "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
           },
-          "warning": {
-            "text": "Failed directory count: {1}",
-            "value": 1
-          },          
-          "critical": {
-            "text": "Failed directory count: {1}",
-            "value": 1
+          "reporting": {
+            "ok": {
+              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]"
+            },
+            "warning": {
+              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
+              "value": 3000
+            },          
+            "critical": {
+              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
+              "value": 5000
+            }
+          },
+          "jmx": {
+            "property_list": [
+              "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcQueueTimeAvgTime",
+              "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcProcessingTimeAvgTime"
+            ],
+            "value": "{0}"
           }
-        },
-        "jmx": {
-          "property_list": [
-            "Hadoop:service=NameNode,name=NameNodeInfo/NameDirStatuses"
-          ],
-          "value": "calculate(args)\ndef calculate(args):\n  import json\n  json_statuses = json.loads({0})\n  return len(json_statuses['failed']) if 'failed' in json_statuses else 0"
         }
-      }
-    },
-    {
-      "name": "namenode_process",
-      "label": "NameNode Process",
-      "interval": 1,
-      "scope": "ANY",
-      "enabled": true,
-      "source": {
-        "type": "PORT",        
-        "uri": "{{hdfs-site/dfs.namenode.http-address}}",
-        "default_port": 50070,
-        "reporting": {
-          "ok": {
-            "text": "TCP OK - {0:.4f} response on port {1}"
+      },
+      {
+        "name": "namenode_directory_status",
+        "label": "NameNode Directory Status",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{hdfs-site/dfs.namenode.http-address}}",
+            "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
           },
-          "critical": {
-            "text": "Connection failed: {0} on host {1}:{2}"
-          }
-        }        
-      }
-    }    
-  ],
-  "SECONDARY_NAMENODE": [
-    {
-      "name": "secondary_namenode_process",
-      "label": "Secondary NameNode Process",
-      "interval": 1,
-      "scope": "ANY",
-      "enabled": true,
-      "source": {
-        "type": "PORT",        
-        "uri": "{{hdfs-site/dfs.namenode.secondary.http-address}}",
-        "default_port": 50071,
-        "reporting": {
-          "ok": {
-            "text": "TCP OK - {0:.4f} response on port {1}"
+          "reporting": {
+            "ok": {
+              "text": "Directories are healthy"
+            },
+            "warning": {
+              "text": "Failed directory count: {1}",
+              "value": 1
+            },          
+            "critical": {
+              "text": "Failed directory count: {1}",
+              "value": 1
+            }
           },
-          "critical": {
-            "text": "Connection failed: {0} on host {1}:{2}"
+          "jmx": {
+            "property_list": [
+              "Hadoop:service=NameNode,name=NameNodeInfo/NameDirStatuses"
+            ],
+            "value": "calculate(args)\ndef calculate(args):\n  import json\n  json_statuses = json.loads({0})\n  return len(json_statuses['failed']) if 'failed' in json_statuses else 0"
           }
-        }        
+        }
+      },
+      {
+        "name": "namenode_process",
+        "label": "NameNode Process",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "PORT",
+          "uri": "{{hdfs-site/dfs.namenode.http-address}}",
+          "default_port": 50070,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.4f} response on port {1}"
+            },
+            "critical": {
+              "text": "Connection failed: {0} on host {1}:{2}"
+            }
+          }        
+        }
       }
-    }
-  ],
-  "JOURNALNODE": [
-    {
-      "name": "journalnode_process",
-      "label": "JournalNode Process",
-      "interval": 1,
-      "scope": "HOST",
-      "enabled": true,
-      "source": {
-        "type": "PORT",        
-        "uri": "{{hdfs-site/dfs.journalnode.http-address}}",
-        "default_port": 8480,
-        "reporting": {
-          "ok": {
-            "text": "TCP OK - {0:.4f} response on port {1}"
-          },
-          "critical": {
-            "text": "Connection failed: {0} on host {1}:{2}"
-          }
-        }        
+    ],
+    "SECONDARY_NAMENODE": [
+      {
+        "name": "secondary_namenode_process",
+        "label": "Secondary NameNode Process",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "PORT",
+          "uri": "{{hdfs-site/dfs.namenode.secondary.http-address}}",
+          "default_port": 50071,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.4f} response on port {1}"
+            },
+            "critical": {
+              "text": "Connection failed: {0} on host {1}:{2}"
+            }
+          }        
+        }
       }
-    }
-  ],      
-  "DATANODE": [
-    {
-      "name": "datanode_process",
-      "label": "DateNode Process",
-      "interval": 1,
-      "scope": "HOST",
-      "enabled": true,
-      "source": {
-        "type": "PORT",        
-        "uri": "{{hdfs-site/dfs.datanode.address}}",
-        "default_port": 50010,
-        "reporting": {
-          "ok": {
-            "text": "TCP OK - {0:.4f} response on port {1}"
-          },
-          "critical": {
-            "text": "Connection failed: {0} on host {1}:{2}"
-          }
-        }        
+    ],
+    "JOURNALNODE": [
+      {
+        "name": "journalnode_process",
+        "label": "JournalNode Process",
+        "interval": 1,
+        "scope": "HOST",
+        "enabled": true,
+        "source": {
+          "type": "PORT",        
+          "uri": "{{hdfs-site/dfs.journalnode.http-address}}",
+          "default_port": 8480,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.4f} response on port {1}"
+            },
+            "critical": {
+              "text": "Connection failed: {0} on host {1}:{2}"
+            }
+          }        
+        }
       }
-    },
-    {
-      "name": "datanode_webui",
-      "label": "DataNode Web UI",
-      "interval": 1,
-      "scope": "HOST",
-      "enabled": true,
-      "source": {
-        "type": "WEB",
-        "uri": {
-          "http": "{{hdfs-site/dfs.datanode.http.address}}",
-          "https": "{{hdfs-site/dfs.datanode.https.address}}",
-          "https_property": "{{hdfs-site/dfs.http.policy}}",
-          "https_property_value": "HTTPS_ONLY"
-        },
-        "reporting": {
-          "ok": {
-            "text": "The UI returned a response code of {0}"
-          },
-          "warning":{
-            "text": "The UI returned a response code of {0}"
+    ],      
+    "DATANODE": [
+      {
+        "name": "datanode_process",
+        "label": "DateNode Process",
+        "interval": 1,
+        "scope": "HOST",
+        "enabled": true,
+        "source": {
+          "type": "PORT",        
+          "uri": "{{hdfs-site/dfs.datanode.address}}",
+          "default_port": 50010,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.4f} response on port {1}"
+            },
+            "critical": {
+              "text": "Connection failed: {0} on host {1}:{2}"
+            }
+          }        
+        }
+      },
+      {
+        "name": "datanode_webui",
+        "label": "DataNode Web UI",
+        "interval": 1,
+        "scope": "HOST",
+        "enabled": true,
+        "source": {
+          "type": "WEB",
+          "uri": {
+            "http": "{{hdfs-site/dfs.datanode.http.address}}",
+            "https": "{{hdfs-site/dfs.datanode.https.address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
           },
-          "critical": {
-            "text": "Connection failed to {1}:{2}"
-          }
-        }        
-      }
-    },    
-    {
-      "name": "datanode_storage",
-      "label": "DataNode Storage",
-      "interval": 2,
-      "scope": "HOST",
-      "enabled": true,
-      "source": {
-        "type": "METRIC",
-        "uri": {
-          "http": "{{hdfs-site/dfs.datanode.http.address}}",
-          "https": "{{hdfs-site/dfs.datanode.https.address}}",
-          "https_property": "{{hdfs-site/dfs.http.policy}}",
-          "https_property_value": "HTTPS_ONLY"
-        },
-        "reporting": {
-          "ok": {
-            "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]"
+          "reporting": {
+            "ok": {
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "warning":{
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "critical": {
+              "text": "Connection failed to {1}:{2}"
+            }
+          }        
+        }
+      },    
+      {
+        "name": "datanode_storage",
+        "label": "DataNode Storage",
+        "interval": 2,
+        "scope": "HOST",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{hdfs-site/dfs.datanode.http.address}}",
+            "https": "{{hdfs-site/dfs.datanode.https.address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
           },
-          "warning": {
-            "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]",
-            "value": 80
+          "reporting": {
+            "ok": {
+              "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]"
+            },
+            "warning": {
+              "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]",
+              "value": 80
+            },
+            "critical": {
+              "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]",
+              "value": 90
+            }
           },
-          "critical": {
-            "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]",
-            "value": 90
+          "jmx": {
+            "property_list": [
+              "Hadoop:service=DataNode,name=FSDatasetState-*/Remaining",
+              "Hadoop:service=DataNode,name=FSDatasetState-*/Capacity"
+            ],
+            "value": "({1} - {0})/{1} * 100"
           }
-        },
-        "jmx": {
-          "property_list": [
-            "Hadoop:service=DataNode,name=FSDatasetState-*/Remaining",
-            "Hadoop:service=DataNode,name=FSDatasetState-*/Capacity"
-          ],
-          "value": "({1} - {0})/{1} * 100"
         }
-      }
-    }    
-  ]
-}
+      }    
+    ]
+  }
+}

+ 310 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/alerts.json

@@ -0,0 +1,310 @@
+{
+  "MAPREDUCE2": {
+    "service": [],
+    "HISTORYSERVER": [
+      {
+        "name": "mapreduce_history_server_webui",
+        "label": "History Server Web UI",
+        "interval": 1,
+        "scope": "ANY",
+        "source": {
+          "type": "WEB",
+          "uri": {
+            "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
+            "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
+            "https_property": "{{mapred-site/mapreduce.jobhistory.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
+          },
+          "reporting": {
+            "ok": {
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "warning":{
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "critical": {
+              "text": "Connection failed to {1}:{2}"
+            }
+          }
+        }
+      },
+      {
+        "name": "mapreduce_history_server_cpu",
+        "label": "History Server CPU Utilization",
+        "interval": 5,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
+            "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
+            "https_property": "{{mapred-site/mapreduce.jobhistory.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
+          },
+          "reporting": {
+            "ok": {
+              "text": "{1} CPU, load {0:.1%}"
+            },
+            "warning": {
+              "text": "{1} CPU, load {0:.1%}",
+              "value": 200
+            },
+            "critical": {
+              "text": "{1} CPU, load {0:.1%}",
+              "value": 250
+            }
+          },
+          "jmx": {
+            "property_list": [
+              "java.lang:type=OperatingSystem/SystemCpuLoad",
+              "java.lang:type=OperatingSystem/AvailableProcessors"
+            ],
+            "value": "{0} * 100"
+          }
+        }
+      },
+      {
+        "name": "mapreduce_history_server_rpc_latency",
+        "label": "History Server RPC Latency",
+        "interval": 5,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
+            "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
+            "https_property": "{{mapred-site/mapreduce.jobhistory.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
+          },
+          "reporting": {
+            "ok": {
+              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]"
+            },
+            "warning": {
+              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
+              "value": 3000
+            },          
+            "critical": {
+              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
+              "value": 5000
+            }
+          },
+          "jmx": {
+            "property_list": [
+              "Hadoop:service=JobHistoryServer,name=RpcActivityForPort*/RpcQueueTimeAvgTime",
+              "Hadoop:service=JobHistoryServer,name=RpcActivityForPort*/RpcProcessingTimeAvgTime"
+            ],
+            "value": "{0}"
+          }
+        }
+      },
+      {
+        "name": "mapreduce_history_server_process",
+        "label": "History Server Process",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "PORT",
+          "uri": "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
+          "default_port": 19888,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.4f} response on port {1}"
+            },
+            "critical": {
+              "text": "Connection failed: {0} on host {1}:{2}"
+            }
+          }        
+        }
+      }    
+    ]
+  },
+  "YARN": {
+    "service": [
+      {
+        "name": "yarn_nodemanager_webui_percent",
+        "label": "Percent NodeManagers Available",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": true,
+        "source": {
+          "type": "AGGREGATE",
+          "alert_name": "yarn_nodemanager_webui",
+          "reporting": {
+            "ok": {
+              "text": "affected: [{1}], total: [{0}]"
+            },
+            "warning": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.1
+            },
+            "critical": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.3
+            }
+          }
+        }
+      }    
+    ],
+    "NODEMANAGER": [
+      {
+        "name": "yarn_nodemanager_webui",
+        "label": "NodeManager Web UI",
+        "interval": 1,
+        "scope": "HOST",
+        "source": {
+          "type": "WEB",
+          "uri": {
+            "http": "{{yarn-site/yarn.nodemanager.webapp.address}}",
+            "https": "{{yarn-site/yarn.nodemanager.webapp.https.address}}",
+            "https_property": "{{yarn-site/yarn.http.policy}}",
+            "https_property_value": "HTTPS_ONLY",
+            "default_port": 8042
+          },
+          "reporting": {
+            "ok": {
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "warning":{
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "critical": {
+              "text": "Connection failed to {1}:{2}"
+            }
+          }
+        }
+      }
+    ],
+    "RESOURCEMANAGER": [
+      {
+        "name": "yarn_resourcemanager_webui",
+        "label": "ResourceManager Web UI",
+        "interval": 1,
+        "scope": "ANY",
+        "source": {
+          "type": "WEB",
+          "uri": {
+            "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}",
+            "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
+            "https_property": "{{yarn-site/yarn.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
+          },
+          "reporting": {
+            "ok": {
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "warning":{
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "critical": {
+              "text": "Connection failed to {1}:{2}"
+            }
+          }
+        }
+      },
+      {
+        "name": "yarn_resourcemanager_cpu",
+        "label": "ResourceManager CPU Utilization",
+        "interval": 5,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}",
+            "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
+            "https_property": "{{yarn-site/yarn.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
+          },
+          "reporting": {
+            "ok": {
+              "text": "{1} CPU, load {0:.1%}"
+            },
+            "warning": {
+              "text": "{1} CPU, load {0:.1%}",
+              "value": 200
+            },
+            "critical": {
+              "text": "{1} CPU, load {0:.1%}",
+              "value": 250
+            }
+          },
+          "jmx": {
+            "property_list": [
+              "java.lang:type=OperatingSystem/SystemCpuLoad",
+              "java.lang:type=OperatingSystem/AvailableProcessors"
+            ],
+            "value": "{0} * 100"
+          }
+        }
+      },
+      {
+        "name": "yarn_resourcemanager_rpc_latency",
+        "label": "ResourceManager RPC Latency",
+        "interval": 5,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}",
+            "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
+            "https_property": "{{yarn-site/yarn.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
+          },
+          "reporting": {
+            "ok": {
+              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]"
+            },
+            "warning": {
+              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
+              "value": 3000
+            },          
+            "critical": {
+              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
+              "value": 5000
+            }
+          },
+          "jmx": {
+            "property_list": [
+              "Hadoop:service=ResourceManager,name=RpcActivityForPort*/RpcQueueTimeAvgTime",
+              "Hadoop:service=ResourceManager,name=RpcActivityForPort*/RpcProcessingTimeAvgTime"
+            ],
+            "value": "{0}"
+          }
+        }
+      }
+    ],
+    "APP_TIMELINE_SERVER": [
+      {
+        "name": "yarn_app_timeline_server_webui",
+        "label": "App Timeline Web UI",
+        "interval": 1,
+        "scope": "ANY",
+        "source": {
+          "type": "WEB",
+          "uri": {
+            "http": "{{yarn-site/yarn.timeline-service.webapp.address}}",
+            "https": "{{yarn-site/yarn.timeline-service.webapp.https.address}}",
+            "https_property": "{{yarn-site/yarn.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
+          },
+          "reporting": {
+            "ok": {
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "warning":{
+              "text": "HTTP {0} response in {3:.4f} seconds"
+            },
+            "critical": {
+              "text": "Connection failed to {1}:{2}"
+            }
+          }
+        }
+      }
+    ]
+  }
+}

+ 51 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/ZOOKEEPER/alerts.json

@@ -0,0 +1,51 @@
+{
+  "ZOOKEEPER": {
+    "service": [
+      {
+        "name": "zookeeper_server_process_percent",
+        "label": "Percent ZooKeeper Servers Available",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": true,
+        "source": {
+          "type": "AGGREGATE",
+          "alert_name": "zookeeper_server_process",
+          "reporting": {
+            "ok": {
+              "text": "affected: [{1}], total: [{0}]"
+            },
+            "warning": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.35
+            },
+            "critical": {
+              "text": "affected: [{1}], total: [{0}]",
+              "value": 0.70
+            }
+          }
+        }
+      }  
+    ],
+    "ZOOKEEPER_SERVER": [
+      {
+        "name": "zookeeper_server_process",
+        "label": "ZooKeeper Server Process",
+        "interval": 1,
+        "scope": "ANY",
+        "source": {
+          "type": "PORT",
+          "uri": "{{zookeeper-env/clientPort}}",
+          "default_port": 2181,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.4f} response on port {1}"
+            },
+            "critical": {
+              "text": "Connection failed: {0} on host {1}:{2}"
+            }
+          }        
+        }
+      }
+    ]
+  }
+}

+ 1 - 0
ambari-server/src/test/java/org/apache/ambari/server/api/services/AmbariMetaInfoTest.java

@@ -1636,6 +1636,7 @@ public class AmbariMetaInfoTest {
     assertNotNull( metricSource.getUri().getHttpsPropertyValue() );
     assertNotNull( metricSource.getUri().getHttpsUri() );
     assertNotNull( metricSource.getUri().getHttpUri() );
+    assertEquals(12345, metricSource.getUri().getDefaultPort());
   }
 
   /**

+ 114 - 111
ambari-server/src/test/resources/stacks/HDP/2.0.5/services/HDFS/alerts.json

@@ -1,123 +1,126 @@
 {
-  "service": [
-  ],
-  "NAMENODE": [
-    {
-      "name": "namenode_cpu",
-      "label": "NameNode Host CPU Utilization",
-      "interval": 2,
-      "scope": "ANY",
-      "enabled": true,
-      "source": {
-        "type": "METRIC",
-        "uri": {
-          "http": "{{hdfs-site/dfs.namenode.http-address}}",
-          "https": "{{hdfs-site/dfs.namenode.https-address}}",
-          "https_property": "{{hdfs-site/dfs.http.policy}}",
-          "https_property_value": "HTTPS_ONLY"
-        },
-        "reporting": {
-          "ok": {
-            "text": "{1} CPU, load {0:.1%}"
+  "HDFS": {  
+    "service": [
+    ],
+    "NAMENODE": [
+      {
+        "name": "namenode_cpu",
+        "label": "NameNode Host CPU Utilization",
+        "interval": 2,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{hdfs-site/dfs.namenode.http-address}}",
+            "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
           },
-          "warning": {
-            "text": "{1} CPU, load {0:.1%}",
-            "value": 200
+          "reporting": {
+            "ok": {
+              "text": "{1} CPU, load {0:.1%}"
+            },
+            "warning": {
+              "text": "{1} CPU, load {0:.1%}",
+              "value": 200
+            },
+            "critical": {
+              "text": "{1} CPU, load {0:.1%}",
+              "value": 250
+            }
           },
-          "critical": {
-            "text": "{1} CPU, load {0:.1%}",
-            "value": 250
+          "jmx": {
+            "property_list": [
+              "java.lang:type=OperatingSystem/SystemCpuLoad",
+              "java.lang:type=OperatingSystem/AvailableProcessors"
+            ],
+            "value": "{0} * 100"
           }
-        },
-        "jmx": {
-          "property_list": [
-            "java.lang:type=OperatingSystem/SystemCpuLoad",
-            "java.lang:type=OperatingSystem/AvailableProcessors"
-          ],
-          "value": "{0} * 100"
+        }
+      },
+      {
+        "name": "namenode_process",
+        "label": "NameNode process",
+        "interval": 1,
+        "scope": "ANY",
+        "source": {
+          "type": "PORT",
+          "uri": "{{hdfs-site/dfs.namenode.http-address}}",
+          "default_port": 50070,
+          "reporting": {
+            "ok": {
+              "text": "TCP OK - {0:.4f} response on port {1}"
+            },
+            "critical": {
+              "text": "TCP FAIL - {0:.4f} response on port {1}"
+            }
+          }        
+        }
+      },
+      {
+        "name": "hdfs_last_checkpoint",
+        "label": "Last Checkpoint Time",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": false,
+        "source": {
+          "type": "SCRIPT",
+          "path": "scripts/alerts/last_checkpoint.py"
         }
       }
-    },
-    {
-      "name": "namenode_process",
-      "label": "NameNode process",
-      "interval": 1,
-      "scope": "ANY",
-      "source": {
-        "type": "PORT",
-        "uri": "{{hdfs-site/dfs.namenode.http-address}}",
-        "default_port": 50070,
-        "reporting": {
-          "ok": {
-            "text": "TCP OK - {0:.4f} response on port {1}"
-          },
-          "critical": {
-            "text": "TCP FAIL - {0:.4f} response on port {1}"
-          }
-        }        
-      }
-    },
-    {
-      "name": "hdfs_last_checkpoint",
-      "label": "Last Checkpoint Time",
-      "interval": 1,
-      "SCOPE": "service",
-      "enabled": false,
-      "source": {
-        "type": "SCRIPT",
-        "path": "scripts/alerts/last_checkpoint.py"
-      }
-    }
-  ],
-  "SECONDARY_NAMENODE": [
-    {
-      "name": "secondary_namenode_process",
-      "label": "Secondary NameNode process",
-      "interval": 1,
-      "scope": "ANY",
-      "source": {
-        "type": "PORT",        
-        "uri": "{{hdfs-site/dfs.namenode.secondary.http-address}}",
-        "default_port": 50070
+    ],
+    "SECONDARY_NAMENODE": [
+      {
+        "name": "secondary_namenode_process",
+        "label": "Secondary NameNode process",
+        "interval": 1,
+        "scope": "ANY",
+        "source": {
+          "type": "PORT",        
+          "uri": "{{hdfs-site/dfs.namenode.secondary.http-address}}",
+          "default_port": 50070
+        }
       }
-    }
-  ],  
-  "DATANODE": [
-    {
-      "name": "datanode_storage",
-      "label": "DataNode Storage",
-      "interval": 2,
-      "scope": "HOST",
-      "enabled": true,
-      "source": {
-        "type": "METRIC",
-        "uri": {
-          "http": "{{hdfs-site/dfs.datanode.http.address}}",
-          "https": "{{hdfs-site/dfs.datanode.https.address}}",
-          "https_property": "{{hdfs-site/dfs.http.policy}}",
-          "https_property_value": "HTTPS_ONLY"
-        },
-        "reporting": {
-          "ok": {
-            "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]"
+    ],  
+    "DATANODE": [
+      {
+        "name": "datanode_storage",
+        "label": "DataNode Storage",
+        "interval": 2,
+        "scope": "HOST",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{hdfs-site/dfs.datanode.http.address}}",
+            "https": "{{hdfs-site/dfs.datanode.https.address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY",
+            "default_port": 12345
           },
-          "warning": {
-            "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]",
-            "value": 80
+          "reporting": {
+            "ok": {
+              "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]"
+            },
+            "warning": {
+              "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]",
+              "value": 80
+            },
+            "critical": {
+              "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]",
+              "value": 90
+            }
           },
-          "critical": {
-            "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]",
-            "value": 90
+          "jmx": {
+            "property_list": [
+              "Hadoop:service=DataNode,name=FSDatasetState-*/Remaining",
+              "Hadoop:service=DataNode,name=FSDatasetState-*/Capacity"
+            ],
+            "value": "({1} - {0})/{1} * 100"
           }
-        },
-        "jmx": {
-          "property_list": [
-            "Hadoop:service=DataNode,name=FSDatasetState-*/Remaining",
-            "Hadoop:service=DataNode,name=FSDatasetState-*/Capacity"
-          ],
-          "value": "({1} - {0})/{1} * 100"
         }
-      }
-    }  
-  ]
+      }  
+    ]
+  }
 }