
AMBARI-8569 - Alert JSON Files Need Descriptions (jonathanhurley)

Jonathan Hurley, 10 years ago
Commit
e551069762
32 changed files with 268 additions and 55 deletions
  1. + 20 - 4
      ambari-agent/src/main/python/ambari_agent/alerts/base_alert.py
  2. + 11 - 5
      ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py
  3. + 12 - 0
      ambari-agent/src/main/python/ambari_agent/alerts/port_alert.py
  4. + 13 - 1
      ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py
  5. + 13 - 2
      ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py
  6. + 66 - 25
      ambari-agent/src/test/python/ambari_agent/TestAlerts.py
  7. + 1 - 0
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json
  8. + 5 - 0
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/GANGLIA/alerts.json
  9. + 4 - 0
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HBASE/alerts.json
  10. + 18 - 0
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json
  11. + 2 - 0
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HIVE/alerts.json
  12. + 2 - 0
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/OOZIE/alerts.json
  13. + 10 - 0
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/alerts.json
  14. + 2 - 0
      ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/ZOOKEEPER/alerts.json
  15. + 5 - 0
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/GANGLIA/alerts.json
  16. + 4 - 0
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HBASE/alerts.json
  17. + 15 - 0
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HDFS/alerts.json
  18. + 3 - 0
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/alerts.json
  19. + 8 - 0
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/MAPREDUCE/alerts.json
  20. + 2 - 0
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/OOZIE/alerts.json
  21. + 2 - 0
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/ZOOKEEPER/alerts.json
  22. + 1 - 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/FLUME/alerts.json
  23. + 5 - 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/GANGLIA/alerts.json
  24. + 4 - 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/alerts.json
  25. + 18 - 18
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/alerts.json
  26. + 3 - 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HIVE/alerts.json
  27. + 2 - 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/OOZIE/alerts.json
  28. + 11 - 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/alerts.json
  29. + 2 - 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/ZOOKEEPER/alerts.json
  30. + 2 - 0
      ambari-server/src/main/resources/stacks/HDP/2.1/services/FALCON/alerts.json
  31. + 1 - 0
      ambari-server/src/main/resources/stacks/HDP/2.2/services/KAFKA/alerts.json
  32. + 1 - 0
      ambari-server/src/main/resources/stacks/HDP/2.2/services/KNOX/alerts.json

+ 20 - 4
ambari-agent/src/main/python/ambari_agent/alerts/base_alert.py

@@ -86,7 +86,7 @@ class BaseAlert(object):
     """ method used for collection.  defers to _collect() """
     
     res = (BaseAlert.RESULT_UNKNOWN, [])
-    res_base_text = "{0}"
+    res_base_text = None
     
     try:
       res = self._collect()
@@ -102,9 +102,15 @@ class BaseAlert(object):
 
         return
 
+      # it's possible that the alert definition doesn't have reporting; safely
+      # check for it and fallback to default text if it doesn't exist
+      if ('reporting' in self.alert_source_meta) and \
+          (reporting_state in self.alert_source_meta['reporting']) and \
+          ('text' in self.alert_source_meta['reporting'][reporting_state]):
+          res_base_text = self.alert_source_meta['reporting'][reporting_state]['text']
 
-      if reporting_state in self.alert_source_meta['reporting']:
-        res_base_text = self.alert_source_meta['reporting'][reporting_state]['text']
+      if res_base_text is None:
+        res_base_text = self._get_reporting_text(result_state)
 
     except Exception as e:
       message = "Unable to run alert {0}".format(str(self.alert_meta['name']))
@@ -304,6 +310,16 @@ class BaseAlert(object):
     """  
     raise NotImplementedError
 
+
+  def _get_reporting_text(self, state):
+    '''
+    Gets the default reporting text to use when the alert definition does not
+    contain any. Subclasses can override this to return specific text.
+    :param state: the state of the alert in uppercase (such as OK, WARNING, etc)
+    :return:  the parameterized text
+    '''
+    return '{0}'
+
   """
   See RFC3986, Appendix B
   Tested on the following cases:
@@ -312,7 +328,7 @@ class BaseAlert(object):
     "hdfs://192.168.54.3/foo/bar"
     "ftp://192.168.54.4:7842/foo/bar"
 
-    Returns None if only a port is passsed in
+    Returns None if only a port is passed in
   """
   @staticmethod
   def get_host_from_url(uri):
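
To make the new fallback in base_alert.py easier to follow, here is a minimal, self-contained sketch of the resolution order (definition-supplied text first, subclass default second). The standalone helper name is an illustration only; in the commit this logic lives inline in BaseAlert.collect().

    def resolve_reporting_text(alert_source_meta, reporting_state, result_state, get_default_text):
        # Prefer the text declared in the alert definition's "reporting" block, if present.
        text = None
        if ('reporting' in alert_source_meta
                and reporting_state in alert_source_meta['reporting']
                and 'text' in alert_source_meta['reporting'][reporting_state]):
            text = alert_source_meta['reporting'][reporting_state]['text']

        # Otherwise fall back to the subclass default, e.g. '{0}' for script and metric alerts.
        if text is None:
            text = get_default_text(result_state)

        return text

    # A definition without a "reporting" block falls back to the default:
    # resolve_reporting_text({}, 'ok', 'OK', lambda state: '{0}')  returns '{0}'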

+ 11 - 5
ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py

@@ -159,6 +159,17 @@ class MetricAlert(BaseAlert):
         
     return value_list
 
+  def _get_reporting_text(self, state):
+    '''
+    Always returns {0} since the result of the metric alert is a rendered string.
+    This will ensure that the base class takes the result string and just uses
+    it directly.
+
+    :param state: the state of the alert in uppercase (such as OK, WARNING, etc)
+    :return:  the parameterized text
+    '''
+    return '{0}'
+
     
 class JmxMetric:
   def __init__(self, jmx_info):
@@ -184,8 +195,3 @@ class JmxMetric:
     if self.custom_module is not None:
       return self.custom_module.f(args)
     return None
-    
-      
-    
-  
-    

+ 12 - 0
ambari-agent/src/main/python/ambari_agent/alerts/port_alert.py

@@ -91,3 +91,15 @@ class PortAlert(BaseAlert):
         except:
           # no need to log a close failure
           pass
+
+  def _get_reporting_text(self, state):
+    '''
+    Gets the default reporting text to use when the alert definition does not
+    contain any.
+    :param state: the state of the alert in uppercase (such as OK, WARNING, etc)
+    :return:  the parameterized text
+    '''
+    if state == self.RESULT_OK:
+      return 'TCP OK - {0:.4f} response on port {1}'
+
+    return 'Connection failed: {0} to {1}:{2}'
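
As a quick illustration of how these PortAlert defaults are parameterized, the placeholders are filled in with str.format(); the response time, error message, hostname, and port below are invented values, not taken from the commit.

    ok_text = 'TCP OK - {0:.4f} response on port {1}'
    print(ok_text.format(0.0013, 2181))
    # -> TCP OK - 0.0013 response on port 2181

    failed_text = 'Connection failed: {0} to {1}:{2}'
    print(failed_text.format('Connection refused', 'c6401.ambari.apache.org', 2181))
    # -> Connection failed: Connection refused to c6401.ambari.apache.org:2181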

+ 13 - 1
ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py

@@ -112,4 +112,16 @@ class ScriptAlert(BaseAlert):
       logger.error("Unable to execute script {0}".format(path_to_script))
       return None
     
-    return imp.load_source(self._find_value('name'), path_to_script)
+    return imp.load_source(self._find_value('name'), path_to_script)
+
+
+  def _get_reporting_text(self, state):
+    '''
+    Always returns {0} since the result of the script alert is a rendered string.
+    This will ensure that the base class takes the result string and just uses
+    it directly.
+
+    :param state: the state of the alert in uppercase (such as OK, WARNING, etc)
+    :return:  the parameterized text
+    '''
+    return '{0}'

+ 13 - 2
ambari-agent/src/main/python/ambari_agent/alerts/web_alert.py

@@ -117,5 +117,16 @@ class WebAlert(BaseAlert):
       return WebResponse(status_code=0, time_millis=0)
     
     return WebResponse(status_code=response.getcode(), time_millis=time_millis) 
-  
-  
+
+
+  def _get_reporting_text(self, state):
+    '''
+    Gets the default reporting text to use when the alert definition does not
+    contain any.
+    :param state: the state of the alert in uppercase (such as OK, WARNING, etc)
+    :return:  the parameterized text
+    '''
+    if state == self.RESULT_CRITICAL:
+      return 'Connection failed to {1}'
+
+    return 'HTTP {0} response in {2:.4f} seconds'
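
Similarly for the WebAlert defaults. Judging from the unit tests below ('(Unit Tests) ok: 200', '(Unit Tests) critical: http://1.2.3.4:80'), {0} is the HTTP status code, {1} the URL, and {2} the response time; the URL and timings here are invented values for illustration.

    ok_text = 'HTTP {0} response in {2:.4f} seconds'
    print(ok_text.format(200, 'http://namenode.example.com:50070', 1.2345))
    # -> HTTP 200 response in 1.2345 seconds

    critical_text = 'Connection failed to {1}'
    print(critical_text.format(0, 'http://namenode.example.com:50070'))
    # -> Connection failed to http://namenode.example.com:50070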

+ 66 - 25
ambari-agent/src/test/python/ambari_agent/TestAlerts.py

@@ -74,10 +74,10 @@ class TestAlerts(TestCase):
         "default_port": 50070,
         "reporting": {
           "ok": {
-            "text": "TCP OK - {0:.4f} response time on port {1}"
+            "text": "(Unit Tests) TCP OK - {0:.4f} response time on port {1}"
           },
           "critical": {
-            "text": "Could not load process info: {0}"
+            "text": "(Unit Tests) Could not load process info: {0}"
           }
         }
       }
@@ -107,10 +107,10 @@ class TestAlerts(TestCase):
         "default_port": 50070,
         "reporting": {
           "ok": {
-            "text": "TCP OK - {0:.4f} response time on port {1}"
+            "text": "(Unit Tests) TCP OK - {0:.4f} response time on port {1}"
           },
           "critical": {
-            "text": "Could not load process info: {0}"
+            "text": "(Unit Tests) Could not load process info: {0}"
           }
         }
       }
@@ -131,6 +131,7 @@ class TestAlerts(TestCase):
     self.assertEquals(0, len(collector.alerts()))
     
     self.assertEquals('OK', alerts[0]['state'])
+    self.assertTrue('(Unit Tests)' in alerts[0]['text'])
     self.assertTrue('response time on port 2181' in alerts[0]['text'])
 
 
@@ -149,10 +150,10 @@ class TestAlerts(TestCase):
         "default_port": 50070,
         "reporting": {
           "ok": {
-            "text": "TCP OK - {0:.4f} response time on port {1}"
+            "text": "(Unit Tests) TCP OK - {0:.4f} response time on port {1}"
           },
           "critical": {
-            "text": "Could not load process info: {0}"
+            "text": "(Unit Tests) Could not load process info: {0}"
           }
         }
       }
@@ -226,14 +227,14 @@ class TestAlerts(TestCase):
         },
         "reporting": {
           "ok": {
-            "text": "ok_arr: {0} {1} {2}",
+            "text": "(Unit Tests) ok_arr: {0} {1} {2}",
           },
           "warning": {
             "text": "",
             "value": 13
           },
           "critical": {
-            "text": "crit_arr: {0} {1} {2}",
+            "text": "(Unit Tests) crit_arr: {0} {1} {2}",
             "value": 72
           }
         }
@@ -251,7 +252,7 @@ class TestAlerts(TestCase):
     self.assertEquals(0, len(collector.alerts()))
     
     self.assertEquals('CRITICAL', alerts[0]['state'])
-    self.assertEquals('crit_arr: 1 3 223', alerts[0]['text'])
+    self.assertEquals('(Unit Tests) crit_arr: 1 3 223', alerts[0]['text'])
 
     del json['source']['jmx']['value']
     collector = AlertCollector()
@@ -263,7 +264,7 @@ class TestAlerts(TestCase):
     self.assertEquals(0, len(collector.alerts()))
 
     self.assertEquals('OK', alerts[0]['state'])
-    self.assertEquals('ok_arr: 1 3 None', alerts[0]['text'])
+    self.assertEquals('(Unit Tests) ok_arr: 1 3 None', alerts[0]['text'])
 
 
   @patch.object(MetricAlert, "_load_jmx")
@@ -294,14 +295,14 @@ class TestAlerts(TestCase):
         },
         "reporting": {
           "ok": {
-            "text": "ok_arr: {0} {1} {2}",
+            "text": "(Unit Tests) ok_arr: {0} {1} {2}",
           },
           "warning": {
             "text": "",
             "value": 10
           },
           "critical": {
-            "text": "crit_arr: {0} {1} {2}",
+            "text": "(Unit Tests) crit_arr: {0} {1} {2}",
             "value": 20
           }
         }
@@ -377,13 +378,13 @@ class TestAlerts(TestCase):
         },
         "reporting": {
           "ok": {
-            "text": "ok: {0}",
+            "text": "(Unit Tests) ok: {0}",
           },
           "warning": {
-            "text": "warning: {0}",
+            "text": "(Unit Tests) warning: {0}",
           },
           "critical": {
-            "text": "critical: {1}",
+            "text": "(Unit Tests) critical: {1}",
           }
         }
       }
@@ -402,7 +403,7 @@ class TestAlerts(TestCase):
     self.assertEquals(0, len(collector.alerts()))
 
     self.assertEquals('OK', alerts[0]['state'])
-    self.assertEquals('ok: 200', alerts[0]['text'])
+    self.assertEquals('(Unit Tests) ok: 200', alerts[0]['text'])
 
     # run the alert and check HTTP 500
     wa_make_web_request_mock.return_value = WebResponse(500,1.234)
@@ -415,7 +416,7 @@ class TestAlerts(TestCase):
     self.assertEquals(0, len(collector.alerts()))
     
     self.assertEquals('WARNING', alerts[0]['state'])
-    self.assertEquals('warning: 500', alerts[0]['text'])
+    self.assertEquals('(Unit Tests) warning: 500', alerts[0]['text'])
 
     # run the alert and check critical
     wa_make_web_request_mock.return_value = WebResponse(0,0)
@@ -430,7 +431,7 @@ class TestAlerts(TestCase):
     
     # http assertion indicating that we properly determined non-SSL
     self.assertEquals('CRITICAL', alerts[0]['state'])
-    self.assertEquals('critical: http://1.2.3.4:80', alerts[0]['text'])
+    self.assertEquals('(Unit Tests) critical: http://1.2.3.4:80', alerts[0]['text'])
      
     collector = AlertCollector()
     alert = WebAlert(json, json['source'])
@@ -446,7 +447,7 @@ class TestAlerts(TestCase):
     
     # SSL assertion
     self.assertEquals('CRITICAL', alerts[0]['state'])
-    self.assertEquals('critical: https://1.2.3.4:8443', alerts[0]['text'])
+    self.assertEquals('(Unit Tests) critical: https://1.2.3.4:8443', alerts[0]['text'])
 
   def test_reschedule(self):
     test_file_path = os.path.join('ambari_agent', 'dummy_files')
@@ -476,10 +477,10 @@ class TestAlerts(TestCase):
         "default_port": 50070,
         "reporting": {
           "ok": {
-            "text": "TCP OK - {0:.4f} response time on port {1}"
+            "text": "(Unit Tests) TCP OK - {0:.4f} response time on port {1}"
           },
           "critical": {
-            "text": "Could not load process info: {0}"
+            "text": "(Unit Tests) Could not load process info: {0}"
           }
         }
       }
@@ -527,10 +528,10 @@ class TestAlerts(TestCase):
         "default_port": 50070,
         "reporting": {
           "ok": {
-            "text": "TCP OK - {0:.4f} response time on port {1}"
+            "text": "(Unit Tests) TCP OK - {0:.4f} response time on port {1}"
           },
           "critical": {
-            "text": "Could not load process info: {0}"
+            "text": "(Unit Tests) Could not load process info: {0}"
           }
         }
       }
@@ -584,10 +585,10 @@ class TestAlerts(TestCase):
             "default_port": 50070,
             "reporting": {
               "ok": {
-                "text": "TCP OK - {0:.4f} response time on port {1}"
+                "text": "(Unit Tests) TCP OK - {0:.4f} response time on port {1}"
               },
               "critical": {
-                "text": "Could not load process info: {0}"
+                "text": "(Unit Tests) Could not load process info: {0}"
               }
             }
           }
@@ -631,3 +632,43 @@ class TestAlerts(TestCase):
 
     # ensure that it was skipped
     self.assertEquals(0,len(collector.alerts()))
+
+
+  def test_default_reporting_text(self):
+    json = {
+      "name": "namenode_process",
+      "service": "HDFS",
+      "component": "NAMENODE",
+      "label": "NameNode process",
+      "interval": 6,
+      "scope": "host",
+      "enabled": True,
+      "uuid": "c1f73191-4481-4435-8dae-fd380e4c0be1",
+      "source": {
+        "type": "SCRIPT",
+        "path": "test_script.py",
+      }
+    }
+
+    alert = ScriptAlert(json, json['source'])
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_OK), '{0}')
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_WARNING), '{0}')
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_CRITICAL), '{0}')
+
+    json['source']['type'] = 'PORT'
+    alert = PortAlert(json, json['source'])
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_OK), 'TCP OK - {0:.4f} response on port {1}')
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_WARNING), 'Connection failed: {0} to {1}:{2}')
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_CRITICAL), 'Connection failed: {0} to {1}:{2}')
+
+    json['source']['type'] = 'WEB'
+    alert = WebAlert(json, json['source'])
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_OK), 'HTTP {0} response in {2:.4f} seconds')
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_WARNING), 'HTTP {0} response in {2:.4f} seconds')
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_CRITICAL), 'Connection failed to {1}')
+
+    json['source']['type'] = 'METRIC'
+    alert = MetricAlert(json, json['source'])
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_OK), '{0}')
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_WARNING), '{0}')
+    self.assertEquals(alert._get_reporting_text(alert.RESULT_CRITICAL), '{0}')

+ 1 - 0
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "flume_agent_status",
         "label": "Flume Agent Status",
+        "description": "This host-level alert is triggerd if any of the expected flume agent processes are not available.", 
         "interval": 1,
         "scope": "ANY",
         "source": {

+ 5 - 0
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/GANGLIA/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "ganglia_server_process",
         "label": "Ganglia Server Process",
+        "description": "This host-level alert is triggered if the Ganglia server process cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -25,6 +26,7 @@
       {
         "name": "ganglia_monitor_hdfs_namenode",
         "label": "Ganglia NameNode Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for HDFS NameNode is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -45,6 +47,7 @@
       {
         "name": "ganglia_monitor_hbase_master",
         "label": "Ganglia HBase Master Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for the HBase Master process is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -65,6 +68,7 @@
       {
         "name": "ganglia_monitor_yarn_resourcemanager",
         "label": "Ganglia ResourceManager Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for the YARN ResourceManager process is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -85,6 +89,7 @@
       {
         "name": "ganglia_monitor_mapreduce_history_server",
         "label": "Ganglia History Server Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for the MapReduce History Server process is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,

+ 4 - 0
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HBASE/alerts.json

@@ -4,6 +4,7 @@
       {
         "name": "hbase_regionserver_process_percent",
         "label": "Percent RegionServers Available",
+        "description": "This service-level alert is triggered if the configured percentage of RegionServer processes cannot be determined to be up and listening on the network for the configured warning and critical thresholds. It aggregates the results of RegionServer process down checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -30,6 +31,7 @@
       {
         "name": "hbase_master_process",
         "label": "HBase Master Process",
+        "description": "This alert is triggered if the HBase master processes cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -49,6 +51,7 @@
       {
         "name": "hbase_master_cpu",
         "label": "HBase Maser CPU Utilization",
+        "description": "This host-level alert is triggered if CPU utilization of the HBase Master exceeds certain warning and critical thresholds. It checks the HBase Master JMX Servlet for the SystemCPULoad property.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -88,6 +91,7 @@
       {
         "name": "hbase_regionserver_process",
         "label": "HBase RegionServer Process",
+        "description": "This host-level alert is triggered if the RegionServer processes cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "HOST",
         "source": {

+ 18 - 0
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json

@@ -4,6 +4,7 @@
       {
         "name": "datanode_process_percent",
         "label": "Percent DataNodes Available",
+        "description": "This alert is triggered if the number of down DataNodes in the cluster is greater than the configured critical threshold. It aggregates the results of DataNode process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -28,6 +29,7 @@
       {
         "name": "datanode_storage_percent",
         "label": "Percent DataNodes With Available Space",
+        "description": "This service-level alert is triggered if the storage on a certain percentage of DataNodes exceeds either the warning or critical threshold values.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -52,6 +54,7 @@
       {
         "name": "journalnode_process_percent",
         "label": "Percent JournalNodes Available",
+        "description": "This alert is triggered if the number of down JournalNodes in the cluster is greater than the configured critical threshold. It aggregates the results of JournalNode process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -78,6 +81,7 @@
       {
         "name": "namenode_webui",
         "label": "NameNode Web UI",
+        "description": "This host-level alert is triggered if the NameNode Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -105,6 +109,7 @@
       {
         "name": "namenode_cpu",
         "label": "NameNode Host CPU Utilization",
+        "description": "This host-level alert is triggered if CPU utilization of the NameNode exceeds certain warning and critical thresholds. It checks the NameNode JMX Servlet for the SystemCPULoad property.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -141,6 +146,7 @@
       {
         "name": "namenode_hdfs_blocks_health",
         "label": "NameNode Blocks Health",
+        "description": "This service-level alert is triggered if the number of corrupt or missing blocks exceeds the configured critical threshold.",
         "interval": 2,
         "scope": "ANY",
         "enabled": true,
@@ -177,6 +183,7 @@
       {
         "name": "namenode_hdfs_capacity_utilization",
         "label": "HDFS Capacity Utilization",
+        "description": "This service-level alert is triggered if the HDFS capacity utilization exceeds the configured warning and critical thresholds. It checks the NameNode JMX Servlet for the CapacityUsed and CapacityRemaining properties.",
         "interval": 2,
         "scope": "ANY",
         "enabled": true,
@@ -213,6 +220,7 @@
       {
         "name": "namenode_rpc_latency",
         "label": "NameNode RPC Latency",
+        "description": "This host-level alert is triggered if the NameNode RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for NameNode operations.",
         "interval": 2,
         "scope": "ANY",
         "enabled": true,
@@ -249,6 +257,7 @@
       {
         "name": "namenode_directory_status",
         "label": "NameNode Directory Status",
+        "description": "This host-level alert is triggered if any of the the NameNode's NameDirStatuses metric reports a failed directory.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -284,6 +293,7 @@
       {
         "name": "namenode_process",
         "label": "NameNode Process",
+        "description": "This host-level alert is triggered if the NameNode process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -304,6 +314,7 @@
       {
         "name": "namenode_last_checkpoint",
         "label": "NameNode Last Checkpoint",
+        "description": "This service-level alert will trigger if the last time that the NameNode performed a checkpoint was too long ago. It will also trigger if the number of uncommitted transactions is beyond a certain threshold.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -315,6 +326,7 @@
       {
         "name": "namenode_ha_health",
         "label": "NameNode High Availability Health",
+        "description": "This service-level alert is triggered if either the Active NameNode or Standby NameNode are not running.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -329,6 +341,7 @@
       {
         "name": "secondary_namenode_process",
         "label": "Secondary NameNode Process",
+        "description": "This host-level alert is triggered if the Secondary NameNode process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -351,6 +364,7 @@
       {
         "name": "journalnode_process",
         "label": "JournalNode Process",
+        "description": "This host-level alert is triggered if the JournalNode process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -373,6 +387,7 @@
       {
         "name": "datanode_process",
         "label": "DataNode Process",
+        "description": "This host-level alert is triggered if the individual DataNode processes cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -393,6 +408,7 @@
       {
         "name": "datanode_webui",
         "label": "DataNode Web UI",
+        "description": "This host-level alert is triggered if the DataNode Web UI is unreachable.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -420,6 +436,7 @@
       {
         "name": "datanode_storage",
         "label": "DataNode Storage",
+        "description": "This host-level alert is triggered if storage capacity if full on the DataNode. It checks the DataNode JMX Servlet for the Capacity and Remaining properties.",
         "interval": 2,
         "scope": "HOST",
         "enabled": true,
@@ -458,6 +475,7 @@
       {
         "name": "hdfs_zookeeper_failover_controller_process",
         "label": "ZooKeeper Failover Controller Process",
+        "description": "This host-level alert is triggered if the ZooKeeper Failover Controller process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,

+ 2 - 0
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HIVE/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "hive_metastore_process",
         "label": "Hive Metastore Process",
+        "description": "This host-level alert is triggered if the Hive Metastore process cannot be determined to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -26,6 +27,7 @@
       {
         "name": "hive_server_process",
         "label": "HiveServer2 Process",
+        "description": "This host-level alert is triggered if the HiveServer cannot be determined to be up and responding to client requests.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,

+ 2 - 0
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/OOZIE/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "oozie_server_webui",
         "label": "Oozie Server Web UI",
+        "description": "This host-level alert is triggered if the Oozie server Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -28,6 +29,7 @@
       {
         "name": "oozie_server_status",
         "label": "Oozie Server Status",
+        "description": "This host-level alert is triggered if the Oozie server cannot be determined to be up and responding to client requests.",
         "interval": 1,
         "scope": "ANY",
         "source": {

+ 10 - 0
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "mapreduce_history_server_webui",
         "label": "History Server Web UI",
+        "description": "This host-level alert is triggered if the History Server Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -31,6 +32,7 @@
       {
         "name": "mapreduce_history_server_cpu",
         "label": "History Server CPU Utilization",
+        "description": "This host-level alert is triggered if the percent of CPU utilization on the History Server exceeds the configured critical threshold.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -67,6 +69,7 @@
       {
         "name": "mapreduce_history_server_rpc_latency",
         "label": "History Server RPC Latency",
+        "description": "This host-level alert is triggered if the History Server operations RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for operations.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -103,6 +106,7 @@
       {
         "name": "mapreduce_history_server_process",
         "label": "History Server Process",
+        "description": "This host-level alert is triggered if the History Server process cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -127,6 +131,7 @@
       {
         "name": "yarn_nodemanager_webui_percent",
         "label": "Percent NodeManagers Available",
+        "description": "This alert is triggered if the number of down NodeManagers in the cluster is greater than the configured critical threshold. It aggregates the results of NodeManager process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -153,6 +158,7 @@
       {
         "name": "yarn_nodemanager_webui",
         "label": "NodeManager Web UI",
+        "description": "This host-level alert is triggered if the NodeManager Web UI is unreachable.",
         "interval": 1,
         "scope": "HOST",
         "source": {
@@ -180,6 +186,7 @@
       {
         "name": "yarn_nodemanager_health",
         "label": "NodeManager Health",
+        "description": "This host-level alert checks the node health property available from the NodeManager component.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -193,6 +200,7 @@
       {
         "name": "yarn_resourcemanager_webui",
         "label": "ResourceManager Web UI",
+        "description": "This host-level alert is triggered if the ResourceManager Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -219,6 +227,7 @@
       {
         "name": "yarn_resourcemanager_cpu",
         "label": "ResourceManager CPU Utilization",
+        "description": "This host-level alert is triggered if CPU utilization of the ResourceManager exceeds certain warning and critical thresholds. It checks the ResourceManager JMX Servlet for the SystemCPULoad property.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -255,6 +264,7 @@
       {
         "name": "yarn_resourcemanager_rpc_latency",
         "label": "ResourceManager RPC Latency",
+        "description": "This host-level alert is triggered if the ResourceManager operations RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for ResourceManager operations.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,

+ 2 - 0
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/ZOOKEEPER/alerts.json

@@ -4,6 +4,7 @@
       {
         "name": "zookeeper_server_process_percent",
         "label": "Percent ZooKeeper Servers Available",
+        "description": "This service-level alert is triggered if the configured percentage of ZooKeeper processes cannot be determined to be up and listening on the network.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -30,6 +31,7 @@
       {
         "name": "zookeeper_server_process",
         "label": "ZooKeeper Server Process",
+        "description": "This host-level alert is triggered if the ZooKeeper server process cannot be determined to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "source": {

+ 5 - 0
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/GANGLIA/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "ganglia_server_process",
         "label": "Ganglia Server Process",
+        "description": "This host-level alert is triggered if the Ganglia server process cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -25,6 +26,7 @@
       {
         "name": "ganglia_monitor_hdfs_namenode",
         "label": "Ganglia NameNode Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for HDFS NameNode is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -45,6 +47,7 @@
       {
         "name": "ganglia_monitor_hbase_master",
         "label": "Ganglia HBase Master Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for the HBase Master process is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -65,6 +68,7 @@
       {
         "name": "ganglia_monitor_mapreduce_jobtracker",
         "label": "Ganglia JobTracker Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for the YARN ResourceManager process is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -85,6 +89,7 @@
       {
         "name": "ganglia_monitor_mapreduce_history_server",
         "label": "Ganglia History Server Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for the MapReduce History Server process is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,

+ 4 - 0
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HBASE/alerts.json

@@ -4,6 +4,7 @@
       {
         "name": "hbase_regionserver_process_percent",
         "label": "Percent RegionServers Available",
+        "description": "This service-level alert is triggered if the configured percentage of RegionServer processes cannot be determined to be up and listening on the network for the configured warning and critical thresholds. It aggregates the results of RegionServer process down checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -30,6 +31,7 @@
       {
         "name": "hbase_master_process",
         "label": "HBase Master Process",
+        "description": "This alert is triggered if the HBase master processes cannot be confirmed to be up and listening on the network for the configured critical threshold, given in seconds.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -49,6 +51,7 @@
       {
         "name": "hbase_master_cpu",
         "label": "HBase Maser CPU Utilization",
+        "description": "This host-level alert is triggered if CPU utilization of the HBase Master exceeds certain warning and critical thresholds. It checks the HBase Master JMX Servlet for the SystemCPULoad property.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -88,6 +91,7 @@
       {
         "name": "hbase_regionserver_process",
         "label": "HBase RegionServer Process",
+        "description": "This host-level alert is triggered if the RegionServer processes cannot be confirmed to be up and listening on the network for the configured critical threshold, given in seconds.",
         "interval": 1,
         "scope": "HOST",
         "source": {

+ 15 - 0
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HDFS/alerts.json

@@ -4,6 +4,7 @@
       {
         "name": "datanode_process_percent",
         "label": "Percent DataNodes Available",
+        "description": "This alert is triggered if the number of down DataNodes in the cluster is greater than the configured critical threshold. It aggregates the results of DataNode process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -28,6 +29,7 @@
       {
         "name": "datanode_storage_percent",
         "label": "Percent DataNodes With Available Space",
+        "description": "This service-level alert is triggered if the storage on a certain percentage of DataNodes exceeds either the warning or critical threshold values.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -52,6 +54,7 @@
       {
         "name": "journalnode_process_percent",
         "label": "Percent JournalNodes Available",
+        "description": "This alert is triggered if the number of down JournalNodes in the cluster is greater than the configured critical threshold. It aggregates the results of JournalNode process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -78,6 +81,7 @@
       {
         "name": "namenode_webui",
         "label": "NameNode Web UI",
+        "description": "This host-level alert is triggered if the NameNode Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -105,6 +109,7 @@
       {
         "name": "namenode_cpu",
         "label": "NameNode Host CPU Utilization",
+        "description": "This host-level alert is triggered if CPU utilization of the NameNode exceeds certain warning and critical thresholds. It checks the NameNode JMX Servlet for the SystemCPULoad property.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -141,6 +146,7 @@
       {
         "name": "namenode_hdfs_blocks_health",
         "label": "NameNode Blocks Health",
+        "description": "This service-level alert is triggered if the number of corrupt or missing blocks exceeds the configured critical threshold.",
         "interval": 2,
         "scope": "ANY",
         "enabled": true,
@@ -177,6 +183,7 @@
       {
         "name": "namenode_hdfs_capacity_utilization",
         "label": "HDFS Capacity Utilization",
+        "description": "This service-level alert is triggered if the HDFS capacity utilization exceeds the configured warning and critical thresholds. It checks the NameNode JMX Servlet for the CapacityUsed and CapacityRemaining properties.",
         "interval": 2,
         "scope": "ANY",
         "enabled": true,
@@ -213,6 +220,7 @@
       {
         "name": "namenode_rpc_latency",
         "label": "NameNode RPC Latency",
+        "description": "This host-level alert is triggered if the NameNode RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for NameNode operations.",
         "interval": 2,
         "scope": "ANY",
         "enabled": true,
@@ -249,6 +257,7 @@
       {
         "name": "namenode_directory_status",
         "label": "NameNode Directory Status",
+        "description": "This host-level alert is triggered if any of the the NameNode's NameDirStatuses metric reports a failed directory.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -284,6 +293,7 @@
       {
         "name": "namenode_process",
         "label": "NameNode Process",
+        "description": "This host-level alert is triggered if the NameNode process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -306,6 +316,7 @@
       {
         "name": "secondary_namenode_process",
         "label": "Secondary NameNode Process",
+        "description": "This host-level alert is triggered if the Secondary NameNode process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -328,6 +339,7 @@
       {
         "name": "journalnode_process",
         "label": "JournalNode Process",
+        "description": "This host-level alert is triggered if the JournalNode process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -350,6 +362,7 @@
       {
         "name": "datanode_process",
         "label": "DataNode Process",
+        "description": "This host-level alert is triggered if the individual DataNode processes cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -370,6 +383,7 @@
       {
         "name": "datanode_webui",
         "label": "DataNode Web UI",
+        "description": "This host-level alert is triggered if the DataNode Web UI is unreachable.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -397,6 +411,7 @@
       {
         "name": "datanode_storage",
         "label": "DataNode Storage",
+        "description": "This host-level alert is triggered if storage capacity if full on the DataNode. It checks the DataNode JMX Servlet for the Capacity and Remaining properties.",
         "interval": 2,
         "scope": "HOST",
         "enabled": true,

+ 3 - 0
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "hive_metastore_process",
         "label": "Hive Metastore Process",
+        "description": "This host-level alert is triggered if the Hive Metastore process cannot be determined to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -26,6 +27,7 @@
       {
         "name": "hive_server_process",
         "label": "HiveServer2 Process",
+        "description": "This host-level alert is triggered if the HiveServer cannot be determined to be up and responding to client requests.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -39,6 +41,7 @@
       {
         "name": "hive_webhcat_server_status",
         "label": "WebHCat Server Status",
+        "description": "This host-level alert is triggered if the templeton server status is not healthy.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,

+ 8 - 0
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/MAPREDUCE/alerts.json

@@ -4,6 +4,7 @@
       {
         "name": "mapreduce_tasktracker_process_percent",
         "label": "Percent TaskTrackers Available",
+        "description": "This alert is triggered if the number of down TaskTrackers in the cluster is greater than the configured critical threshold. It aggregates the results of TaskTrackers process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -30,6 +31,7 @@
       {
         "name": "mapreduce_jobtracker_webui",
         "label": "JobTracker Web UI",
+        "description": "This host-level alert is triggered if the JobTracker Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -54,6 +56,7 @@
       {
         "name": "mapreduce_jobtracker_cpu",
         "label": "JobTracker Host CPU Utilization",
+        "description": "This host-level alert is triggered if the percent of CPU utilization on the JobTracker exceeds the configured critical threshold.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -87,6 +90,7 @@
       {
         "name": "mapreduce_jobtracker_rpc_latency",
         "label": "JobTracker RPC Latency",
+        "description": "This host-level alert is triggered if the JobTracker operations RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for operations.",
         "interval": 2,
         "scope": "ANY",
         "enabled": true,
@@ -120,6 +124,7 @@
       {
         "name": "mapreduce_jobtracker_process",
         "label": "JobTracker Process",
+        "description": "This host-level alert is triggered if the JobTracker process cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -142,6 +147,7 @@
       {
         "name": "mapreduce_tasktracker_process",
         "label": "TaskTracker Process",
+        "description": "This host-level alert is triggered if the TaskTracker process cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -162,6 +168,7 @@
       {
         "name": "mapreduce_local_directory_space",
         "label": "MapReduce Local Directory Space",
+        "description": "This host-level alert is triggered if the task tracker is reporting low disk space.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -175,6 +182,7 @@
       {
         "name": "mapreduce_historyserver_webui",
         "label": "History Server Web UI",
+        "description": "This host-level alert is triggered if the History Server Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,

+ 2 - 0
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/OOZIE/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "oozie_server_webui",
         "label": "Oozie Server Web UI",
+        "description": "This host-level alert is triggered if the Oozie server Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -28,6 +29,7 @@
       {
         "name": "oozie_server_status",
         "label": "Oozie Server Status",
+        "description": "This host-level alert is triggered if the Oozie server cannot be determined to be up and responding to client requests.",
         "interval": 1,
         "scope": "ANY",
         "source": {

+ 2 - 0
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/ZOOKEEPER/alerts.json

@@ -4,6 +4,7 @@
       {
         "name": "zookeeper_server_process_percent",
         "label": "Percent ZooKeeper Servers Available",
+        "description": "This alert is triggered if the number of down ZooKeeper servers in the cluster is greater than the configured critical threshold. It aggregates the results of ZooKeeper process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -30,6 +31,7 @@
       {
         "name": "zookeeper_server_process",
         "label": "ZooKeeper Server Process",
+        "description": "This host-level alert is triggered if the ZooKeeper server process cannot be determined to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "source": {

+ 1 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/FLUME/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "flume_agent_status",
         "label": "Flume Agent Status",
+        "description": "This host-level alert is triggerd if any of the expected flume agent processes are not available.",
         "interval": 1,
         "scope": "ANY",
         "source": {

+ 5 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/GANGLIA/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "ganglia_server_process",
         "label": "Ganglia Server Process",
+        "description": "This host-level alert is triggered if the Ganglia server process cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -25,6 +26,7 @@
       {
         "name": "ganglia_monitor_hdfs_namenode",
         "label": "Ganglia NameNode Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for HDFS NameNode is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -45,6 +47,7 @@
       {
         "name": "ganglia_monitor_hbase_master",
         "label": "Ganglia HBase Master Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for the HBase Master process is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -65,6 +68,7 @@
       {
         "name": "ganglia_monitor_yarn_resourcemanager",
         "label": "Ganglia ResourceManager Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for the YARN ResourceManager process is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -85,6 +89,7 @@
       {
         "name": "ganglia_monitor_mapreduce_history_server",
         "label": "Ganglia History Server Process Monitor",
+        "description": "This host-level alert is triggered if the Ganglia gmond process which handles receiving metrics for the MapReduce History Server process is not up and listening.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,

+ 4 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/alerts.json

@@ -4,6 +4,7 @@
       {
         "name": "hbase_regionserver_process_percent",
         "label": "Percent RegionServers Available",
+        "description": "This service-level alert is triggered if the configured percentage of RegionServer processes cannot be determined to be up and listening on the network for the configured warning and critical thresholds. It aggregates the results of RegionServer process down checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -30,6 +31,7 @@
       {
         "name": "hbase_master_process",
         "label": "HBase Master Process",
+        "description": "This alert is triggered if the HBase master processes cannot be confirmed to be up and listening on the network for the configured critical threshold, given in seconds.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -49,6 +51,7 @@
       {
         "name": "hbase_master_cpu",
         "label": "HBase Maser CPU Utilization",
+        "description": "This host-level alert is triggered if CPU utilization of the HBase Master exceeds certain warning and critical thresholds. It checks the HBase Master JMX Servlet for the SystemCPULoad property.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -88,6 +91,7 @@
       {
         "name": "hbase_regionserver_process",
         "label": "HBase RegionServer Process",
+        "description": "This host-level alert is triggered if the RegionServer processes cannot be confirmed to be up and listening on the network for the configured critical threshold, given in seconds.",
         "interval": 1,
         "scope": "HOST",
         "source": {

+ 18 - 18
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/alerts.json

@@ -4,7 +4,7 @@
       {
         "name": "datanode_process_percent",
         "label": "Percent DataNodes Available",
-        "description": "The percentage of DataNodes that are responding to the process check compared to those that are not.",
+        "description": "This alert is triggered if the number of down DataNodes in the cluster is greater than the configured critical threshold. It aggregates the results of DataNode process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -29,7 +29,7 @@
       {
         "name": "datanode_storage_percent",
         "label": "Percent DataNodes With Available Space",
-        "description": "The percentage of DataNodes that report they have sufficient space compared to those that are not.",
+        "description": "This service-level alert is triggered if the storage on a certain percentage of DataNodes exceeds either the warning or critical threshold values.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -54,7 +54,7 @@
       {
         "name": "journalnode_process_percent",
         "label": "Percent JournalNodes Available",
-        "description": "The percentage of JournalNodes that are responding to the process check compared to those that are not.",
+        "description": "This alert is triggered if the number of down JournalNodes in the cluster is greater than the configured critical threshold. It aggregates the results of JournalNode process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -81,7 +81,7 @@
       {
         "name": "namenode_webui",
         "label": "NameNode Web UI",
-        "description": "An HTTP-style request to the NameNode webpage to verify that it can be accessed.",
+        "description": "This host-level alert is triggered if the NameNode Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -109,7 +109,7 @@
       {
         "name": "namenode_cpu",
         "label": "NameNode Host CPU Utilization",
-        "description": "Checks that the CPU of the NameNode is below a certain load percentage.",
+        "description": "This host-level alert is triggered if CPU utilization of the NameNode exceeds certain warning and critical thresholds. It checks the NameNode JMX Servlet for the SystemCPULoad property.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -146,7 +146,7 @@
       {
         "name": "namenode_hdfs_blocks_health",
         "label": "NameNode Blocks Health",
-        "description": "Checks to see if the NameNode is reporting any blocks as missing.",
+        "description": "This service-level alert is triggered if the number of corrupt or missing blocks exceeds the configured critical threshold.",
         "interval": 2,
         "scope": "ANY",
         "enabled": true,
@@ -183,7 +183,7 @@
       {
         "name": "namenode_hdfs_capacity_utilization",
         "label": "HDFS Capacity Utilization",
-        "description": "Checks the overall capacity remaining reported by the NameNode and triggers if it falls between a certain threshold",
+        "description": "This service-level alert is triggered if the HDFS capacity utilization exceeds the configured warning and critical thresholds. It checks the NameNode JMX Servlet for the CapacityUsed and CapacityRemaining properties.",
         "interval": 2,
         "scope": "ANY",
         "enabled": true,
@@ -220,7 +220,7 @@
       {
         "name": "namenode_rpc_latency",
         "label": "NameNode RPC Latency",
-        "description": "Check the TCP latency reported by the NameNode.",
+        "description": "This host-level alert is triggered if the NameNode RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for NameNode operations.",
         "interval": 2,
         "scope": "ANY",
         "enabled": true,
@@ -257,7 +257,7 @@
       {
         "name": "namenode_directory_status",
         "label": "NameNode Directory Status",
-        "description": "Checks the NameNode's NameDirStatuses metric to see if any directories report a failure.",
+        "description": "This host-level alert is triggered if any of the the NameNode's NameDirStatuses metric reports a failed directory.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -293,7 +293,7 @@
       {
         "name": "namenode_process",
         "label": "NameNode Process",
-        "description": "Checks that the NameNode process responds to a TCP port request.",
+        "description": "This host-level alert is triggered if the NameNode process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -314,7 +314,7 @@
       {
         "name": "namenode_last_checkpoint",
         "label": "NameNode Last Checkpoint",
-        "description": "Checks the last time that the NameNode performed a checkpoint. This script will also check for the number of uncommitted transactions.",
+        "description": "This service-level alert will trigger if the last time that the NameNode performed a checkpoint was too long ago. It will also trigger if the number of uncommitted transactions is beyond a certain threshold.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -326,7 +326,7 @@
       {
         "name": "namenode_ha_health",
         "label": "NameNode High Availability Health",
-        "description": "When running in HA mode, this will check the states of both the Active and Standby NameNode to ensure that there is exactly 1 of each.",
+        "description": "This service-level alert is triggered if either the Active NameNode or Standby NameNode are not running.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -341,7 +341,7 @@
       {
         "name": "secondary_namenode_process",
         "label": "Secondary NameNode Process",
-        "description": "Checks that the Secondary NameNode process responds to a TCP port request.",
+        "description": "This host-level alert is triggered if the Secondary NameNode process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -364,7 +364,7 @@
       {
         "name": "journalnode_process",
         "label": "JournalNode Process",
-        "description": "Checks that the JournalNode process responds to a TCP port request.",
+        "description": "This host-level alert is triggered if the JournalNode process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -387,7 +387,7 @@
       {
         "name": "datanode_process",
         "label": "DataNode Process",
-        "description": "Checks that the DataNode process responds to a TCP port request.",
+        "description": "This host-level alert is triggered if the individual DataNode processes cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -408,7 +408,7 @@
       {
         "name": "datanode_webui",
         "label": "DataNode Web UI",
-        "description": "An HTTP-style request to the DataNode webpage to verify that it can be accessed.",
+        "description": "This host-level alert is triggered if the DataNode Web UI is unreachable.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -436,7 +436,7 @@
       {
         "name": "datanode_storage",
         "label": "DataNode Storage",
-        "description": "Checks the capacity remaining on a DataNode and triggers if below a certain percentage.",
+        "description": "This host-level alert is triggered if storage capacity if full on the DataNode. It checks the DataNode JMX Servlet for the Capacity and Remaining properties.",
         "interval": 2,
         "scope": "HOST",
         "enabled": true,
@@ -475,7 +475,7 @@
       {
         "name": "hdfs_zookeeper_failover_controller_process",
         "label": "ZooKeeper Failover Controller Process",
-        "description": "Checks that the ZooKeeper Failover Controller process responds to a TCP port request.",
+        "description": "This host-level alert is triggered if the ZooKeeper Failover Controller process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,

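Several of the HDFS descriptions above refer to metric-style checks that read the NameNode's JMX servlet (SystemCPULoad, CapacityUsed, CapacityRemaining) and compare the result to warning/critical thresholds. The following is only a minimal sketch of that kind of capacity check; the /jmx URL, bean name, and threshold values are assumptions for illustration, not the agent's actual metric_alert implementation.

import json
import urllib.request

# Illustrative thresholds; real values would come from the alert definition's reporting block.
WARNING_PCT = 80.0
CRITICAL_PCT = 90.0

def check_hdfs_capacity(jmx_url):
    """Read CapacityUsed/CapacityRemaining from a NameNode /jmx endpoint and
    map the utilization percentage to OK / WARNING / CRITICAL."""
    # The bean name is assumed for illustration.
    query = jmx_url + "?qry=Hadoop:service=NameNode,name=FSNamesystem"
    with urllib.request.urlopen(query, timeout=5) as response:
        beans = json.load(response)["beans"][0]

    used = float(beans["CapacityUsed"])
    remaining = float(beans["CapacityRemaining"])
    total = used + remaining
    if total <= 0:
        return "UNKNOWN", 0.0

    utilization = 100.0 * used / total
    if utilization >= CRITICAL_PCT:
        return "CRITICAL", utilization
    if utilization >= WARNING_PCT:
        return "WARNING", utilization
    return "OK", utilization

# Example against a hypothetical NameNode:
# print(check_hdfs_capacity("http://namenode.example.com:50070/jmx"))
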
+ 3 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HIVE/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "hive_metastore_process",
         "label": "Hive Metastore Process",
+        "description": "This host-level alert is triggered if the Hive Metastore process cannot be determined to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -26,6 +27,7 @@
       {
         "name": "hive_server_process",
         "label": "HiveServer2 Process",
+        "description": "This host-level alert is triggered if the HiveServer cannot be determined to be up and responding to client requests.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -39,6 +41,7 @@
       {
         "name": "hive_webhcat_server_status",
         "label": "WebHCat Server Status",
+        "description": "This host-level alert is triggered if the templeton server status is not healthy.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,

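The Hive process alerts above confirm that a daemon is up and listening on the network. A minimal sketch of a TCP probe in that spirit follows; the host, port, and timeout values are hypothetical, and this is not the agent's actual port_alert implementation.

import socket
import time

def check_tcp_port(host, port, critical_timeout=5.0, warning_timeout=1.5):
    """Attempt a TCP connection; failures are CRITICAL, slow responses WARNING."""
    start = time.time()
    try:
        with socket.create_connection((host, port), timeout=critical_timeout):
            elapsed = time.time() - start
    except OSError as err:
        return "CRITICAL", "Connection failed: {0}".format(err)

    if elapsed > warning_timeout:
        return "WARNING", "Responded in {0:.3f}s".format(elapsed)
    return "OK", "Responded in {0:.3f}s".format(elapsed)

# Example against a hypothetical Hive Metastore endpoint:
# print(check_tcp_port("metastore.example.com", 9083))
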
+ 2 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/OOZIE/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "oozie_server_webui",
         "label": "Oozie Server Web UI",
+        "description": "This host-level alert is triggered if the Oozie server Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -28,6 +29,7 @@
       {
         "name": "oozie_server_status",
         "label": "Oozie Server Status",
+        "description": "This host-level alert is triggered if the Oozie server cannot be determined to be up and responding to client requests.",
         "interval": 1,
         "scope": "ANY",
         "source": {

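The Web UI alerts above (Oozie here, plus the NameNode and DataNode Web UI checks earlier) are described as HTTP reachability checks. A small sketch of such a probe follows; the URL is hypothetical, and this is only an illustration rather than the agent's web_alert implementation.

import urllib.error
import urllib.request

def check_web_ui(url, timeout=5.0):
    """Issue an HTTP GET; any HTTP response counts as reachable, while
    connection errors and timeouts count as unreachable."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return "OK", "HTTP {0} from {1}".format(response.getcode(), url)
    except urllib.error.HTTPError as err:
        # The server answered, so the UI is reachable even if the status is 4xx/5xx.
        return "OK", "HTTP {0} from {1}".format(err.code, url)
    except (urllib.error.URLError, OSError) as err:
        return "CRITICAL", "Unable to reach {0}: {1}".format(url, err)

# Example against a hypothetical Oozie console:
# print(check_web_ui("http://oozie.example.com:11000/oozie"))
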
+ 11 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "mapreduce_history_server_webui",
         "label": "History Server Web UI",
+        "description": "This host-level alert is triggered if the History Server Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -31,6 +32,7 @@
       {
         "name": "mapreduce_history_server_cpu",
         "label": "History Server CPU Utilization",
+        "description": "This host-level alert is triggered if the percent of CPU utilization on the History Server exceeds the configured critical threshold.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -67,6 +69,7 @@
       {
         "name": "mapreduce_history_server_rpc_latency",
         "label": "History Server RPC Latency",
+        "description": "This host-level alert is triggered if the History Server operations RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for operations.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -103,6 +106,7 @@
       {
         "name": "mapreduce_history_server_process",
         "label": "History Server Process",
+        "description": "This host-level alert is triggered if the History Server process cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,
@@ -127,6 +131,7 @@
       {
         "name": "yarn_nodemanager_webui_percent",
         "label": "Percent NodeManagers Available",
+        "description": "This alert is triggered if the number of down NodeManagers in the cluster is greater than the configured critical threshold. It aggregates the results of NodeManager process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -153,6 +158,7 @@
       {
         "name": "yarn_nodemanager_webui",
         "label": "NodeManager Web UI",
+        "description": "This host-level alert is triggered if the NodeManager Web UI is unreachable.",
         "interval": 1,
         "scope": "HOST",
         "source": {
@@ -180,6 +186,7 @@
       {
         "name": "yarn_nodemanager_health",
         "label": "NodeManager Health",
+        "description": "This host-level alert checks the node health property available from the NodeManager component.",
         "interval": 1,
         "scope": "HOST",
         "enabled": true,
@@ -193,6 +200,7 @@
       {
         "name": "yarn_resourcemanager_webui",
         "label": "ResourceManager Web UI",
+        "description": "This host-level alert is triggered if the ResourceManager Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -219,6 +227,7 @@
       {
         "name": "yarn_resourcemanager_cpu",
         "label": "ResourceManager CPU Utilization",
+        "description": "This host-level alert is triggered if CPU utilization of the ResourceManager exceeds certain warning and critical thresholds. It checks the ResourceManager JMX Servlet for the SystemCPULoad property.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -255,6 +264,7 @@
       {
         "name": "yarn_resourcemanager_rpc_latency",
         "label": "ResourceManager RPC Latency",
+        "description": "This host-level alert is triggered if the ResourceManager operations RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for ResourceManager operations.",
         "interval": 5,
         "scope": "ANY",
         "enabled": true,
@@ -293,6 +303,7 @@
       {
         "name": "yarn_app_timeline_server_webui",
         "label": "App Timeline Web UI",
+        "description": "This host-level alert is triggered if the App Timeline Server Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "source": {

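Several descriptions above tie RPC latency alerts to the average queue wait time exceeding a critical threshold. The comparison itself might look like the sketch below; the metric key and threshold values are illustrative assumptions, not necessarily the exact Hadoop metric names.

def check_rpc_latency(metrics, warning_ms=3000.0, critical_ms=5000.0):
    """Classify average RPC queue wait time against warning/critical thresholds.
    'metrics' is a dict of already-collected JMX values; the key below is illustrative."""
    queue_time_ms = float(metrics.get("RpcQueueTimeAvgTime", 0.0))

    if queue_time_ms >= critical_ms:
        state = "CRITICAL"
    elif queue_time_ms >= warning_ms:
        state = "WARNING"
    else:
        state = "OK"
    return state, "Average RPC queue wait time is {0:.0f} ms".format(queue_time_ms)

# Example with a hypothetical metrics snapshot:
# print(check_rpc_latency({"RpcQueueTimeAvgTime": 4200}))
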
+ 2 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/ZOOKEEPER/alerts.json

@@ -4,6 +4,7 @@
       {
         "name": "zookeeper_server_process_percent",
         "label": "Percent ZooKeeper Servers Available",
+        "description": "This alert is triggered if the number of down ZooKeeper servers in the cluster is greater than the configured critical threshold. It aggregates the results of ZooKeeper process checks.",
         "interval": 1,
         "scope": "SERVICE",
         "enabled": true,
@@ -30,6 +31,7 @@
       {
         "name": "zookeeper_server_process",
         "label": "ZooKeeper Server Process",
+        "description": "This host-level alert is triggered if the ZooKeeper server process cannot be determined to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "source": {

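The percent-available alerts above (ZooKeeper here, and the DataNode, JournalNode, and NodeManager aggregates earlier) roll up per-host results and compare the failure percentage to thresholds. A small sketch of that aggregation follows, with illustrative threshold values.

def aggregate_percent_alert(host_results, warning_pct=10.0, critical_pct=30.0):
    """Given per-host alert states, compute the percentage of non-OK hosts
    and compare it against warning/critical thresholds."""
    total = len(host_results)
    if total == 0:
        return "UNKNOWN", "No host results to aggregate"

    failed = sum(1 for state in host_results.values() if state != "OK")
    failed_pct = 100.0 * failed / total
    text = "{0}/{1} hosts failing ({2:.1f}%)".format(failed, total, failed_pct)

    if failed_pct >= critical_pct:
        return "CRITICAL", text
    if failed_pct >= warning_pct:
        return "WARNING", text
    return "OK", text

# Example with hypothetical ZooKeeper hosts:
# print(aggregate_percent_alert({"zk1": "OK", "zk2": "CRITICAL", "zk3": "OK"}))
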
+ 2 - 0
ambari-server/src/main/resources/stacks/HDP/2.1/services/FALCON/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "falcon_server_process",
         "label": "Falcon Server Process",
+        "description": "This host-level alert is triggered if the individual Falcon server process cannot be established to be up and listening on the network.",
         "interval": 1,
         "scope": "ANY",
         "source": {
@@ -24,6 +25,7 @@
       {
         "name": "falcon_server_webui",
         "label": "Falcon Server Web UI",
+        "description": "This host-level alert is triggered if the Falcon Server Web UI is unreachable.",
         "interval": 1,
         "scope": "ANY",
         "enabled": true,

+ 1 - 0
ambari-server/src/main/resources/stacks/HDP/2.2/services/KAFKA/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "kafka_broker_process",
         "label": "Kafka Broker Process",
+        "description": "This host-level alert is triggered if the Kafka Broker cannot be determined to be up.",
         "interval": 1,
         "scope": "HOST",
         "source": {

+ 1 - 0
ambari-server/src/main/resources/stacks/HDP/2.2/services/KNOX/alerts.json

@@ -5,6 +5,7 @@
       {
         "name": "knox_gateway_process",
         "label": "Know Gateway Process",
+        "description": "This host-level alert is triggered if the Knox Gateway cannot be determined to be up.",
         "interval": 1,
         "scope": "HOST",
         "source": {